tomoto 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (420) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +123 -0
  5. data/ext/tomoto/ext.cpp +245 -0
  6. data/ext/tomoto/extconf.rb +28 -0
  7. data/lib/tomoto.rb +12 -0
  8. data/lib/tomoto/ct.rb +11 -0
  9. data/lib/tomoto/hdp.rb +11 -0
  10. data/lib/tomoto/lda.rb +67 -0
  11. data/lib/tomoto/version.rb +3 -0
  12. data/vendor/EigenRand/EigenRand/Core.h +1139 -0
  13. data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
  14. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
  15. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
  16. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
  17. data/vendor/EigenRand/EigenRand/EigenRand +19 -0
  18. data/vendor/EigenRand/EigenRand/Macro.h +24 -0
  19. data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
  20. data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
  21. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
  22. data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
  23. data/vendor/EigenRand/EigenRand/doc.h +220 -0
  24. data/vendor/EigenRand/LICENSE +21 -0
  25. data/vendor/EigenRand/README.md +288 -0
  26. data/vendor/eigen/COPYING.BSD +26 -0
  27. data/vendor/eigen/COPYING.GPL +674 -0
  28. data/vendor/eigen/COPYING.LGPL +502 -0
  29. data/vendor/eigen/COPYING.MINPACK +52 -0
  30. data/vendor/eigen/COPYING.MPL2 +373 -0
  31. data/vendor/eigen/COPYING.README +18 -0
  32. data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
  33. data/vendor/eigen/Eigen/Cholesky +46 -0
  34. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  35. data/vendor/eigen/Eigen/Core +537 -0
  36. data/vendor/eigen/Eigen/Dense +7 -0
  37. data/vendor/eigen/Eigen/Eigen +2 -0
  38. data/vendor/eigen/Eigen/Eigenvalues +61 -0
  39. data/vendor/eigen/Eigen/Geometry +62 -0
  40. data/vendor/eigen/Eigen/Householder +30 -0
  41. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  42. data/vendor/eigen/Eigen/Jacobi +33 -0
  43. data/vendor/eigen/Eigen/LU +50 -0
  44. data/vendor/eigen/Eigen/MetisSupport +35 -0
  45. data/vendor/eigen/Eigen/OrderingMethods +73 -0
  46. data/vendor/eigen/Eigen/PaStiXSupport +48 -0
  47. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  48. data/vendor/eigen/Eigen/QR +51 -0
  49. data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
  50. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  51. data/vendor/eigen/Eigen/SVD +51 -0
  52. data/vendor/eigen/Eigen/Sparse +36 -0
  53. data/vendor/eigen/Eigen/SparseCholesky +45 -0
  54. data/vendor/eigen/Eigen/SparseCore +69 -0
  55. data/vendor/eigen/Eigen/SparseLU +46 -0
  56. data/vendor/eigen/Eigen/SparseQR +37 -0
  57. data/vendor/eigen/Eigen/StdDeque +27 -0
  58. data/vendor/eigen/Eigen/StdList +26 -0
  59. data/vendor/eigen/Eigen/StdVector +27 -0
  60. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  61. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  62. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
  63. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
  64. data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  65. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
  66. data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
  67. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
  68. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
  69. data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
  70. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
  71. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
  72. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
  73. data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
  74. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
  75. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
  76. data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
  77. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
  78. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
  79. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
  80. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
  81. data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  82. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
  84. data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
  85. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
  86. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
  87. data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
  88. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
  89. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
  90. data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
  91. data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
  92. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
  93. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
  94. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
  95. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
  96. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
  97. data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
  98. data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
  99. data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
  100. data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
  101. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
  102. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
  103. data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
  104. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
  105. data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
  106. data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
  107. data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
  108. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
  109. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
  110. data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
  111. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
  112. data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
  113. data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
  114. data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
  115. data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
  116. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
  117. data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
  118. data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
  119. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
  120. data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  121. data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
  122. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
  123. data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
  124. data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
  125. data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
  126. data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
  127. data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
  128. data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
  129. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
  130. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
  131. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
  132. data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
  134. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
  135. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
  139. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
  140. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
  142. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
  146. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
  148. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
  160. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
  161. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
  162. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
  163. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
  164. data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  165. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
  166. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
  167. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
  168. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
  169. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  170. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
  171. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
  172. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  173. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
  174. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
  175. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
  176. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
  177. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  178. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  179. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
  180. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
  181. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
  182. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  183. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  184. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
  185. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
  186. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
  187. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
  188. data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
  189. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
  190. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
  191. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
  192. data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
  193. data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
  194. data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
  195. data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
  196. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
  197. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
  198. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
  199. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  200. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
  201. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  202. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  203. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  204. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  205. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  206. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  207. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
  208. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
  209. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  210. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
  211. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  212. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
  213. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
  214. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
  215. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
  216. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
  217. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
  218. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
  219. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
  220. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
  221. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
  222. data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
  223. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
  224. data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
  225. data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
  226. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
  227. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
  228. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
  229. data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
  230. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
  231. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  232. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
  233. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
  234. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
  235. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
  236. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
  237. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
  238. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
  239. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
  240. data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
  241. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
  242. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
  243. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
  244. data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  245. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
  246. data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  247. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
  248. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
  249. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
  250. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  251. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
  252. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
  253. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  254. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
  255. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
  256. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
  257. data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  258. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
  259. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
  260. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
  261. data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  262. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
  263. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  264. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
  265. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
  266. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
  267. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
  268. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  269. data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  270. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
  271. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
  283. data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
  295. data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  296. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
  297. data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  298. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  299. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  300. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  307. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  308. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  309. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  310. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  311. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  312. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  313. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
  314. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
  315. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
  316. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
  317. data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
  318. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
  319. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
  320. data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
  321. data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
  322. data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
  323. data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
  324. data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
  325. data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
  326. data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
  327. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
  328. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
  329. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
  330. data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  331. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
  332. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  333. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
  334. data/vendor/eigen/README.md +3 -0
  335. data/vendor/eigen/bench/README.txt +55 -0
  336. data/vendor/eigen/bench/btl/COPYING +340 -0
  337. data/vendor/eigen/bench/btl/README +154 -0
  338. data/vendor/eigen/bench/tensors/README +21 -0
  339. data/vendor/eigen/blas/README.txt +6 -0
  340. data/vendor/eigen/demos/mandelbrot/README +10 -0
  341. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  342. data/vendor/eigen/demos/opengl/README +13 -0
  343. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
  344. data/vendor/eigen/unsupported/README.txt +50 -0
  345. data/vendor/tomotopy/LICENSE +21 -0
  346. data/vendor/tomotopy/README.kr.rst +375 -0
  347. data/vendor/tomotopy/README.rst +382 -0
  348. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
  349. data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
  350. data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
  351. data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
  352. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
  353. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
  354. data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
  355. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
  356. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
  357. data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
  358. data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
  359. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
  360. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
  361. data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
  362. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
  363. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
  364. data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
  365. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
  366. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
  367. data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
  368. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
  369. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
  370. data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
  371. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
  372. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
  373. data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
  374. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
  375. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
  376. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
  377. data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
  378. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
  379. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
  380. data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
  381. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
  382. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
  383. data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
  384. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
  385. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
  386. data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
  387. data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
  388. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
  389. data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
  390. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
  391. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
  392. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
  393. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
  394. data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
  395. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
  396. data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
  397. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
  398. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
  399. data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
  400. data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
  401. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
  402. data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
  403. data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
  404. data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
  405. data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
  406. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
  407. data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
  408. data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
  409. data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
  410. data/vendor/tomotopy/src/Utils/exception.h +28 -0
  411. data/vendor/tomotopy/src/Utils/math.h +281 -0
  412. data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
  413. data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
  414. data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
  415. data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
  416. data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
  417. data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
  418. data/vendor/tomotopy/src/Utils/text.hpp +49 -0
  419. data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
  420. metadata +531 -0
@@ -0,0 +1,91 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
11
+ #define EIGEN_MATH_FUNCTIONS_CUDA_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ // Make sure this is only available when targeting a GPU: we don't want to
18
+ // introduce conflicts between these packet_traits definitions and the ones
19
+ // we'll use on the host side (SSE, AVX, ...)
20
+ #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
21
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // plog specialized for the float4 packet type
22
+ float4 plog<float4>(const float4& a)
23
+ {
24
+ return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));  // logf on each of x, y, z, w, repacked into a float4
25
+ }
26
+
27
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // plog specialized for the double2 packet type
28
+ double2 plog<double2>(const double2& a)
29
+ {
30
+ using ::log;  // make the global-namespace log visible to the unqualified calls below
31
+ return make_double2(log(a.x), log(a.y));  // log on x and y, repacked into a double2
32
+ }
33
+
34
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // plog1p specialized for the float4 packet type
35
+ float4 plog1p<float4>(const float4& a)
36
+ {
37
+ return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));  // log1pf on each of x, y, z, w, repacked into a float4
38
+ }
39
+
40
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // plog1p specialized for the double2 packet type
41
+ double2 plog1p<double2>(const double2& a)
42
+ {
43
+ return make_double2(log1p(a.x), log1p(a.y));  // log1p on x and y, repacked into a double2 (called unqualified, no using-declaration)
44
+ }
45
+
46
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // pexp specialized for the float4 packet type
47
+ float4 pexp<float4>(const float4& a)
48
+ {
49
+ return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));  // expf on each of x, y, z, w, repacked into a float4
50
+ }
51
+
52
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // pexp specialized for the double2 packet type
53
+ double2 pexp<double2>(const double2& a)
54
+ {
55
+ using ::exp;  // make the global-namespace exp visible to the unqualified calls below
56
+ return make_double2(exp(a.x), exp(a.y));  // exp on x and y, repacked into a double2
57
+ }
58
+
59
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // psqrt specialized for the float4 packet type
60
+ float4 psqrt<float4>(const float4& a)
61
+ {
62
+ return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));  // sqrtf on each of x, y, z, w, repacked into a float4
63
+ }
64
+
65
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // psqrt specialized for the double2 packet type
66
+ double2 psqrt<double2>(const double2& a)
67
+ {
68
+ using ::sqrt;  // make the global-namespace sqrt visible to the unqualified calls below
69
+ return make_double2(sqrt(a.x), sqrt(a.y));  // sqrt on x and y, repacked into a double2
70
+ }
71
+
72
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // prsqrt specialized for the float4 packet type
73
+ float4 prsqrt<float4>(const float4& a)
74
+ {
75
+ return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));  // rsqrtf (CUDA reciprocal square root) on each of x, y, z, w
76
+ }
77
+
78
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE  // prsqrt specialized for the double2 packet type
79
+ double2 prsqrt<double2>(const double2& a)
80
+ {
81
+ return make_double2(rsqrt(a.x), rsqrt(a.y));  // rsqrt (CUDA reciprocal square root) on x and y, repacked into a double2
82
+ }
83
+
84
+
85
+ #endif
86
+
87
+ } // end namespace internal
88
+
89
+ } // end namespace Eigen
90
+
91
+ #endif // EIGEN_MATH_FUNCTIONS_CUDA_H
@@ -0,0 +1,333 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_CUDA_H
11
+ #define EIGEN_PACKET_MATH_CUDA_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ // Make sure this is only available when targeting a GPU: we don't want to
18
+ // introduce conflicts between these packet_traits definitions and the ones
19
+ // we'll use on the host side (SSE, AVX, ...)
20
+ #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
21
+ template<> struct is_arithmetic<float4> { enum { value = true }; };
22
+ template<> struct is_arithmetic<double2> { enum { value = true }; };
23
+
24
+ template<> struct packet_traits<float> : default_packet_traits
25
+ {
26
+ typedef float4 type;
27
+ typedef float4 half;
28
+ enum {
29
+ Vectorizable = 1,
30
+ AlignedOnScalar = 1,
31
+ size=4,
32
+ HasHalfPacket = 0,
33
+
34
+ HasDiv = 1,
35
+ HasSin = 0,
36
+ HasCos = 0,
37
+ HasLog = 1,
38
+ HasExp = 1,
39
+ HasSqrt = 1,
40
+ HasRsqrt = 1,
41
+ HasLGamma = 1,
42
+ HasDiGamma = 1,
43
+ HasZeta = 1,
44
+ HasPolygamma = 1,
45
+ HasErf = 1,
46
+ HasErfc = 1,
47
+ HasIGamma = 1,
48
+ HasIGammac = 1,
49
+ HasBetaInc = 1,
50
+
51
+ HasBlend = 0,
52
+ };
53
+ };
54
+
55
+ template<> struct packet_traits<double> : default_packet_traits
56
+ {
57
+ typedef double2 type;
58
+ typedef double2 half;
59
+ enum {
60
+ Vectorizable = 1,
61
+ AlignedOnScalar = 1,
62
+ size=2,
63
+ HasHalfPacket = 0,
64
+
65
+ HasDiv = 1,
66
+ HasLog = 1,
67
+ HasExp = 1,
68
+ HasSqrt = 1,
69
+ HasRsqrt = 1,
70
+ HasLGamma = 1,
71
+ HasDiGamma = 1,
72
+ HasZeta = 1,
73
+ HasPolygamma = 1,
74
+ HasErf = 1,
75
+ HasErfc = 1,
76
+ HasIGamma = 1,
77
+ HasIGammac = 1,
78
+ HasBetaInc = 1,
79
+
80
+ HasBlend = 0,
81
+ };
82
+ };
83
+
84
+
85
+ template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
86
+ template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
87
+
88
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
89
+ return make_float4(from, from, from, from);
90
+ }
91
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
92
+ return make_double2(from, from);
93
+ }
94
+
95
+
96
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
97
+ return make_float4(a, a+1, a+2, a+3);
98
+ }
99
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
100
+ return make_double2(a, a+1);
101
+ }
102
+
103
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
104
+ return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
105
+ }
106
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
107
+ return make_double2(a.x+b.x, a.y+b.y);
108
+ }
109
+
110
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
111
+ return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
112
+ }
113
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
114
+ return make_double2(a.x-b.x, a.y-b.y);
115
+ }
116
+
117
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
118
+ return make_float4(-a.x, -a.y, -a.z, -a.w);
119
+ }
120
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
121
+ return make_double2(-a.x, -a.y);
122
+ }
123
+
124
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
125
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
126
+
127
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
128
+ return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
129
+ }
130
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
131
+ return make_double2(a.x*b.x, a.y*b.y);
132
+ }
133
+
134
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
135
+ return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
136
+ }
137
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
138
+ return make_double2(a.x/b.x, a.y/b.y);
139
+ }
140
+
141
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
142
+ return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
143
+ }
144
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
145
+ return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
146
+ }
147
+
148
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
149
+ return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
150
+ }
151
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
152
+ return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
153
+ }
154
+
155
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
156
+ return *reinterpret_cast<const float4*>(from);
157
+ }
158
+
159
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
160
+ return *reinterpret_cast<const double2*>(from);
161
+ }
162
+
163
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
164
+ return make_float4(from[0], from[1], from[2], from[3]);
165
+ }
166
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
167
+ return make_double2(from[0], from[1]);
168
+ }
169
+
170
+ template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
171
+ return make_float4(from[0], from[0], from[1], from[1]);
172
+ }
173
+ template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
174
+ return make_double2(from[0], from[0]);
175
+ }
176
+
177
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
178
+ *reinterpret_cast<float4*>(to) = from;
179
+ }
180
+
181
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
182
+ *reinterpret_cast<double2*>(to) = from;
183
+ }
184
+
185
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
186
+ to[0] = from.x;
187
+ to[1] = from.y;
188
+ to[2] = from.z;
189
+ to[3] = from.w;
190
+ }
191
+
192
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
193
+ to[0] = from.x;
194
+ to[1] = from.y;
195
+ }
196
+
197
+ template<>
198
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
199
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
200
+ return __ldg((const float4*)from);
201
+ #else
202
+ return make_float4(from[0], from[1], from[2], from[3]);
203
+ #endif
204
+ }
205
+ template<>
206
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
207
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
208
+ return __ldg((const double2*)from);
209
+ #else
210
+ return make_double2(from[0], from[1]);
211
+ #endif
212
+ }
213
+
214
+ template<>
215
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
216
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
217
+ return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
218
+ #else
219
+ return make_float4(from[0], from[1], from[2], from[3]);
220
+ #endif
221
+ }
222
+ template<>
223
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
224
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
225
+ return make_double2(__ldg(from+0), __ldg(from+1));
226
+ #else
227
+ return make_double2(from[0], from[1]);
228
+ #endif
229
+ }
230
+
231
+ template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
232
+ return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
233
+ }
234
+
235
+ template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
236
+ return make_double2(from[0*stride], from[1*stride]);
237
+ }
238
+
239
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
240
+ to[stride*0] = from.x;
241
+ to[stride*1] = from.y;
242
+ to[stride*2] = from.z;
243
+ to[stride*3] = from.w;
244
+ }
245
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
246
+ to[stride*0] = from.x;
247
+ to[stride*1] = from.y;
248
+ }
249
+
250
+ template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
251
+ return a.x;
252
+ }
253
+ template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
254
+ return a.x;
255
+ }
256
+
257
+ template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
258
+ return a.x + a.y + a.z + a.w;
259
+ }
260
+ template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
261
+ return a.x + a.y;
262
+ }
263
+
264
+ template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
265
+ return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
266
+ }
267
+ template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
268
+ return fmax(a.x, a.y);
269
+ }
270
+
271
+ template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
272
+ return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
273
+ }
274
+ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
275
+ return fmin(a.x, a.y);
276
+ }
277
+
278
+ template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
279
+ return a.x * a.y * a.z * a.w;
280
+ }
281
+ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
282
+ return a.x * a.y;
283
+ }
284
+
285
+ template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
286
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
287
+ }
288
+ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
289
+ return make_double2(fabs(a.x), fabs(a.y));
290
+ }
291
+
292
+ EIGEN_DEVICE_FUNC inline void
293
+ ptranspose(PacketBlock<float4,4>& kernel) {
294
+ float tmp = kernel.packet[0].y;
295
+ kernel.packet[0].y = kernel.packet[1].x;
296
+ kernel.packet[1].x = tmp;
297
+
298
+ tmp = kernel.packet[0].z;
299
+ kernel.packet[0].z = kernel.packet[2].x;
300
+ kernel.packet[2].x = tmp;
301
+
302
+ tmp = kernel.packet[0].w;
303
+ kernel.packet[0].w = kernel.packet[3].x;
304
+ kernel.packet[3].x = tmp;
305
+
306
+ tmp = kernel.packet[1].z;
307
+ kernel.packet[1].z = kernel.packet[2].y;
308
+ kernel.packet[2].y = tmp;
309
+
310
+ tmp = kernel.packet[1].w;
311
+ kernel.packet[1].w = kernel.packet[3].y;
312
+ kernel.packet[3].y = tmp;
313
+
314
+ tmp = kernel.packet[2].w;
315
+ kernel.packet[2].w = kernel.packet[3].z;
316
+ kernel.packet[3].z = tmp;
317
+ }
318
+
319
+ EIGEN_DEVICE_FUNC inline void
320
+ ptranspose(PacketBlock<double2,2>& kernel) {
321
+ double tmp = kernel.packet[0].y;
322
+ kernel.packet[0].y = kernel.packet[1].x;
323
+ kernel.packet[1].x = tmp;
324
+ }
325
+
326
+ #endif
327
+
328
+ } // end namespace internal
329
+
330
+ } // end namespace Eigen
331
+
332
+
333
+ #endif // EIGEN_PACKET_MATH_CUDA_H
@@ -0,0 +1,1124 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
11
+ #define EIGEN_PACKET_MATH_HALF_CUDA_H
12
+
13
+
14
+ namespace Eigen {
15
+ namespace internal {
16
+
17
+ // Most of the following operations require arch >= 3.0
18
+ #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
19
+
20
+ template<> struct is_arithmetic<half2> { enum { value = true }; };
21
+
22
+ template<> struct packet_traits<Eigen::half> : default_packet_traits
23
+ {
24
+ typedef half2 type;
25
+ typedef half2 half;
26
+ enum {
27
+ Vectorizable = 1,
28
+ AlignedOnScalar = 1,
29
+ size=2,
30
+ HasHalfPacket = 0,
31
+ HasAdd = 1,
32
+ HasMul = 1,
33
+ HasDiv = 1,
34
+ HasSqrt = 1,
35
+ HasRsqrt = 1,
36
+ HasExp = 1,
37
+ HasLog = 1,
38
+ HasLog1p = 1
39
+ };
40
+ };
41
+
42
+ template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
43
+
44
+ template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
45
+ return __half2half2(from);
46
+ }
47
+
48
+ template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
49
+ return *reinterpret_cast<const half2*>(from);
50
+ }
51
+
52
+ template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
53
+ return __halves2half2(from[0], from[1]);
54
+ }
55
+
56
+ template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
57
+ return __halves2half2(from[0], from[0]);
58
+ }
59
+
60
+ template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
61
+ *reinterpret_cast<half2*>(to) = from;
62
+ }
63
+
64
+ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
65
+ to[0] = __low2half(from);
66
+ to[1] = __high2half(from);
67
+ }
68
+
69
+ template<>
70
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
71
+ #if __CUDA_ARCH__ >= 350
72
+ return __ldg((const half2*)from);
73
+ #else
74
+ return __halves2half2(*(from+0), *(from+1));
75
+ #endif
76
+ }
77
+
78
+ template<>
79
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
80
+ #if __CUDA_ARCH__ >= 350
81
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
82
+ #else
83
+ return __halves2half2(*(from+0), *(from+1));
84
+ #endif
85
+ }
86
+
87
+ template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
88
+ return __halves2half2(from[0*stride], from[1*stride]);
89
+ }
90
+
91
+ template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
92
+ to[stride*0] = __low2half(from);
93
+ to[stride*1] = __high2half(from);
94
+ }
95
+
96
+ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
97
+ return __low2half(a);
98
+ }
99
+
100
+ template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
101
+ half2 result;
102
+ unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
103
+ *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
104
+ return result;
105
+ }
106
+
107
+
108
+ __device__ EIGEN_STRONG_INLINE void
109
+ ptranspose(PacketBlock<half2,2>& kernel) {
110
+ __half a1 = __low2half(kernel.packet[0]);
111
+ __half a2 = __high2half(kernel.packet[0]);
112
+ __half b1 = __low2half(kernel.packet[1]);
113
+ __half b2 = __high2half(kernel.packet[1]);
114
+ kernel.packet[0] = __halves2half2(a1, b1);
115
+ kernel.packet[1] = __halves2half2(a2, b2);
116
+ }
117
+
118
+ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
119
+ #if __CUDA_ARCH__ >= 530
120
+ return __halves2half2(a, __hadd(a, __float2half(1.0f)));
121
+ #else
122
+ float f = __half2float(a) + 1.0f;
123
+ return __halves2half2(a, __float2half(f));
124
+ #endif
125
+ }
126
+
127
+ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
128
+ #if __CUDA_ARCH__ >= 530
129
+ return __hadd2(a, b);
130
+ #else
131
+ float a1 = __low2float(a);
132
+ float a2 = __high2float(a);
133
+ float b1 = __low2float(b);
134
+ float b2 = __high2float(b);
135
+ float r1 = a1 + b1;
136
+ float r2 = a2 + b2;
137
+ return __floats2half2_rn(r1, r2);
138
+ #endif
139
+ }
140
+
141
+ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
142
+ #if __CUDA_ARCH__ >= 530
143
+ return __hsub2(a, b);
144
+ #else
145
+ float a1 = __low2float(a);
146
+ float a2 = __high2float(a);
147
+ float b1 = __low2float(b);
148
+ float b2 = __high2float(b);
149
+ float r1 = a1 - b1;
150
+ float r2 = a2 - b2;
151
+ return __floats2half2_rn(r1, r2);
152
+ #endif
153
+ }
154
+
155
+ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
156
+ #if __CUDA_ARCH__ >= 530
157
+ return __hneg2(a);
158
+ #else
159
+ float a1 = __low2float(a);
160
+ float a2 = __high2float(a);
161
+ return __floats2half2_rn(-a1, -a2);
162
+ #endif
163
+ }
164
+
165
+ template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
166
+
167
+ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
168
+ #if __CUDA_ARCH__ >= 530
169
+ return __hmul2(a, b);
170
+ #else
171
+ float a1 = __low2float(a);
172
+ float a2 = __high2float(a);
173
+ float b1 = __low2float(b);
174
+ float b2 = __high2float(b);
175
+ float r1 = a1 * b1;
176
+ float r2 = a2 * b2;
177
+ return __floats2half2_rn(r1, r2);
178
+ #endif
179
+ }
180
+
181
+ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
182
+ #if __CUDA_ARCH__ >= 530
183
+ return __hfma2(a, b, c);
184
+ #else
185
+ float a1 = __low2float(a);
186
+ float a2 = __high2float(a);
187
+ float b1 = __low2float(b);
188
+ float b2 = __high2float(b);
189
+ float c1 = __low2float(c);
190
+ float c2 = __high2float(c);
191
+ float r1 = a1 * b1 + c1;
192
+ float r2 = a2 * b2 + c2;
193
+ return __floats2half2_rn(r1, r2);
194
+ #endif
195
+ }
196
+
197
+ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
198
+ float a1 = __low2float(a);
199
+ float a2 = __high2float(a);
200
+ float b1 = __low2float(b);
201
+ float b2 = __high2float(b);
202
+ float r1 = a1 / b1;
203
+ float r2 = a2 / b2;
204
+ return __floats2half2_rn(r1, r2);
205
+ }
206
+
207
+ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
208
+ float a1 = __low2float(a);
209
+ float a2 = __high2float(a);
210
+ float b1 = __low2float(b);
211
+ float b2 = __high2float(b);
212
+ __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
213
+ __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
214
+ return __halves2half2(r1, r2);
215
+ }
216
+
217
+ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
218
+ float a1 = __low2float(a);
219
+ float a2 = __high2float(a);
220
+ float b1 = __low2float(b);
221
+ float b2 = __high2float(b);
222
+ __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
223
+ __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
224
+ return __halves2half2(r1, r2);
225
+ }
226
+
227
+ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
228
+ #if __CUDA_ARCH__ >= 530
229
+ return __hadd(__low2half(a), __high2half(a));
230
+ #else
231
+ float a1 = __low2float(a);
232
+ float a2 = __high2float(a);
233
+ return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
234
+ #endif
235
+ }
236
+
237
+ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
238
+ #if __CUDA_ARCH__ >= 530
239
+ __half first = __low2half(a);
240
+ __half second = __high2half(a);
241
+ return __hgt(first, second) ? first : second;
242
+ #else
243
+ float a1 = __low2float(a);
244
+ float a2 = __high2float(a);
245
+ return a1 > a2 ? __low2half(a) : __high2half(a);
246
+ #endif
247
+ }
248
+
249
+ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
250
+ #if __CUDA_ARCH__ >= 530
251
+ __half first = __low2half(a);
252
+ __half second = __high2half(a);
253
+ return __hlt(first, second) ? first : second;
254
+ #else
255
+ float a1 = __low2float(a);
256
+ float a2 = __high2float(a);
257
+ return a1 < a2 ? __low2half(a) : __high2half(a);
258
+ #endif
259
+ }
260
+
261
+ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
262
+ #if __CUDA_ARCH__ >= 530
263
+ return __hmul(__low2half(a), __high2half(a));
264
+ #else
265
+ float a1 = __low2float(a);
266
+ float a2 = __high2float(a);
267
+ return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
268
+ #endif
269
+ }
270
+
271
+ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
272
+ float a1 = __low2float(a);
273
+ float a2 = __high2float(a);
274
+ float r1 = log1pf(a1);
275
+ float r2 = log1pf(a2);
276
+ return __floats2half2_rn(r1, r2);
277
+ }
278
+
279
+ #if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
280
+
281
+ template<> __device__ EIGEN_STRONG_INLINE
282
+ half2 plog<half2>(const half2& a) {
283
+ return h2log(a);
284
+ }
285
+
286
+ template<> __device__ EIGEN_STRONG_INLINE
287
+ half2 pexp<half2>(const half2& a) {
288
+ return h2exp(a);
289
+ }
290
+
291
+ template<> __device__ EIGEN_STRONG_INLINE
292
+ half2 psqrt<half2>(const half2& a) {
293
+ return h2sqrt(a);
294
+ }
295
+
296
+ template<> __device__ EIGEN_STRONG_INLINE
297
+ half2 prsqrt<half2>(const half2& a) {
298
+ return h2rsqrt(a);
299
+ }
300
+
301
+ #else
302
+
303
+ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
304
+ float a1 = __low2float(a);
305
+ float a2 = __high2float(a);
306
+ float r1 = logf(a1);
307
+ float r2 = logf(a2);
308
+ return __floats2half2_rn(r1, r2);
309
+ }
310
+
311
+ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
312
+ float a1 = __low2float(a);
313
+ float a2 = __high2float(a);
314
+ float r1 = expf(a1);
315
+ float r2 = expf(a2);
316
+ return __floats2half2_rn(r1, r2);
317
+ }
318
+
319
+ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
320
+ float a1 = __low2float(a);
321
+ float a2 = __high2float(a);
322
+ float r1 = sqrtf(a1);
323
+ float r2 = sqrtf(a2);
324
+ return __floats2half2_rn(r1, r2);
325
+ }
326
+
327
+ template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
328
+ float a1 = __low2float(a);
329
+ float a2 = __high2float(a);
330
+ float r1 = rsqrtf(a1);
331
+ float r2 = rsqrtf(a2);
332
+ return __floats2half2_rn(r1, r2);
333
+ }
334
+
335
+ #endif
336
+
337
+ #elif defined EIGEN_VECTORIZE_AVX512
338
+
339
+ typedef struct {
340
+ __m256i x;
341
+ } Packet16h;
342
+
343
+
344
+ template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
345
+
346
+ template <>
347
+ struct packet_traits<half> : default_packet_traits {
348
+ typedef Packet16h type;
349
+ // There is no half-size packet for Packet16h.
350
+ typedef Packet16h half;
351
+ enum {
352
+ Vectorizable = 1,
353
+ AlignedOnScalar = 1,
354
+ size = 16,
355
+ HasHalfPacket = 0,
356
+ HasAdd = 0,
357
+ HasSub = 0,
358
+ HasMul = 0,
359
+ HasNegate = 0,
360
+ HasAbs = 0,
361
+ HasAbs2 = 0,
362
+ HasMin = 0,
363
+ HasMax = 0,
364
+ HasConj = 0,
365
+ HasSetLinear = 0,
366
+ HasDiv = 0,
367
+ HasSqrt = 0,
368
+ HasRsqrt = 0,
369
+ HasExp = 0,
370
+ HasLog = 0,
371
+ HasBlend = 0
372
+ };
373
+ };
374
+
375
+
376
+ template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
377
+
378
+ template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
379
+ Packet16h result;
380
+ result.x = _mm256_set1_epi16(from.x);
381
+ return result;
382
+ }
383
+
384
+ template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
385
+ return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
386
+ }
387
+
388
+ template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
389
+ Packet16h result;
390
+ result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
391
+ return result;
392
+ }
393
+
394
+ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
395
+ Packet16h result;
396
+ result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
397
+ return result;
398
+ }
399
+
400
+ template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
401
+ _mm256_store_si256((__m256i*)to, from.x);
402
+ }
403
+
404
+ template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
405
+ _mm256_storeu_si256((__m256i*)to, from.x);
406
+ }
407
+
408
+ template<> EIGEN_STRONG_INLINE Packet16h
409
+ ploadquad(const Eigen::half* from) {
410
+ Packet16h result;
411
+ unsigned short a = from[0].x;
412
+ unsigned short b = from[1].x;
413
+ unsigned short c = from[2].x;
414
+ unsigned short d = from[3].x;
415
+ result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
416
+ return result;
417
+ }
418
+
419
+ EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
420
+ #ifdef EIGEN_HAS_FP16_C
421
+ return _mm512_cvtph_ps(a.x);
422
+ #else
423
+ EIGEN_ALIGN64 half aux[16];
424
+ pstore(aux, a);
425
+ float f0(aux[0]);
426
+ float f1(aux[1]);
427
+ float f2(aux[2]);
428
+ float f3(aux[3]);
429
+ float f4(aux[4]);
430
+ float f5(aux[5]);
431
+ float f6(aux[6]);
432
+ float f7(aux[7]);
433
+ float f8(aux[8]);
434
+ float f9(aux[9]);
435
+ float fa(aux[10]);
436
+ float fb(aux[11]);
437
+ float fc(aux[12]);
438
+ float fd(aux[13]);
439
+ float fe(aux[14]);
440
+ float ff(aux[15]);
441
+
442
+ return _mm512_set_ps(
443
+ ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
444
+ #endif
445
+ }
446
+
447
+ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
448
+ #ifdef EIGEN_HAS_FP16_C
449
+ Packet16h result;
450
+ result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
451
+ return result;
452
+ #else
453
+ EIGEN_ALIGN64 float aux[16];
454
+ pstore(aux, a);
455
+ half h0(aux[0]);
456
+ half h1(aux[1]);
457
+ half h2(aux[2]);
458
+ half h3(aux[3]);
459
+ half h4(aux[4]);
460
+ half h5(aux[5]);
461
+ half h6(aux[6]);
462
+ half h7(aux[7]);
463
+ half h8(aux[8]);
464
+ half h9(aux[9]);
465
+ half ha(aux[10]);
466
+ half hb(aux[11]);
467
+ half hc(aux[12]);
468
+ half hd(aux[13]);
469
+ half he(aux[14]);
470
+ half hf(aux[15]);
471
+
472
+ Packet16h result;
473
+ result.x = _mm256_set_epi16(
474
+ hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
475
+ h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
476
+ return result;
477
+ #endif
478
+ }
479
+
480
+ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
481
+ Packet16f af = half2float(a);
482
+ Packet16f bf = half2float(b);
483
+ Packet16f rf = padd(af, bf);
484
+ return float2half(rf);
485
+ }
486
+
487
+ template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
488
+ Packet16f af = half2float(a);
489
+ Packet16f bf = half2float(b);
490
+ Packet16f rf = pmul(af, bf);
491
+ return float2half(rf);
492
+ }
493
+
494
+ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
495
+ Packet16f from_float = half2float(from);
496
+ return half(predux(from_float));
497
+ }
498
+
499
+ template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
500
+ {
501
+ Packet16h result;
502
+ result.x = _mm256_set_epi16(
503
+ from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
504
+ from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
505
+ from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
506
+ from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
507
+ return result;
508
+ }
509
+
510
+ template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
511
+ {
512
+ EIGEN_ALIGN64 half aux[16];
513
+ pstore(aux, from);
514
+ to[stride*0].x = aux[0].x;
515
+ to[stride*1].x = aux[1].x;
516
+ to[stride*2].x = aux[2].x;
517
+ to[stride*3].x = aux[3].x;
518
+ to[stride*4].x = aux[4].x;
519
+ to[stride*5].x = aux[5].x;
520
+ to[stride*6].x = aux[6].x;
521
+ to[stride*7].x = aux[7].x;
522
+ to[stride*8].x = aux[8].x;
523
+ to[stride*9].x = aux[9].x;
524
+ to[stride*10].x = aux[10].x;
525
+ to[stride*11].x = aux[11].x;
526
+ to[stride*12].x = aux[12].x;
527
+ to[stride*13].x = aux[13].x;
528
+ to[stride*14].x = aux[14].x;
529
+ to[stride*15].x = aux[15].x;
530
+ }
531
+
532
+ EIGEN_STRONG_INLINE void
533
+ ptranspose(PacketBlock<Packet16h,16>& kernel) {
534
+ __m256i a = kernel.packet[0].x;
535
+ __m256i b = kernel.packet[1].x;
536
+ __m256i c = kernel.packet[2].x;
537
+ __m256i d = kernel.packet[3].x;
538
+ __m256i e = kernel.packet[4].x;
539
+ __m256i f = kernel.packet[5].x;
540
+ __m256i g = kernel.packet[6].x;
541
+ __m256i h = kernel.packet[7].x;
542
+ __m256i i = kernel.packet[8].x;
543
+ __m256i j = kernel.packet[9].x;
544
+ __m256i k = kernel.packet[10].x;
545
+ __m256i l = kernel.packet[11].x;
546
+ __m256i m = kernel.packet[12].x;
547
+ __m256i n = kernel.packet[13].x;
548
+ __m256i o = kernel.packet[14].x;
549
+ __m256i p = kernel.packet[15].x;
550
+
551
+ __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
552
+ __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
553
+ __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
554
+ __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
555
+ __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
556
+ __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
557
+ __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
558
+ __m256i op_07 = _mm256_unpacklo_epi16(o, p);
559
+
560
+ __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
561
+ __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
562
+ __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
563
+ __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
564
+ __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
565
+ __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
566
+ __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
567
+ __m256i op_8f = _mm256_unpackhi_epi16(o, p);
568
+
569
+ __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
570
+ __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
571
+ __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
572
+ __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
573
+ __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
574
+ __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
575
+ __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
576
+ __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
577
+
578
+ __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
579
+ __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
580
+ __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
581
+ __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
582
+ __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
583
+ __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
584
+ __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
585
+ __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
586
+
587
+ __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
588
+ __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
589
+ __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
590
+ __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
591
+ __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
592
+ __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
593
+ __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
594
+ __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
595
+ __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
596
+ __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
597
+ __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
598
+ __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
599
+ __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
600
+ __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
601
+ __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
602
+ __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
603
+
604
+ // NOTE: no unpacklo/hi instr in this case, so using permute instr.
605
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
606
+ __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
607
+ __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
608
+ __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
609
+ __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
610
+ __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
611
+ __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
612
+ __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
613
+ __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
614
+ __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
615
+ __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
616
+ __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
617
+ __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
618
+ __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
619
+ __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
620
+ __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
621
+
622
+ kernel.packet[0].x = a_p_0;
623
+ kernel.packet[1].x = a_p_1;
624
+ kernel.packet[2].x = a_p_2;
625
+ kernel.packet[3].x = a_p_3;
626
+ kernel.packet[4].x = a_p_4;
627
+ kernel.packet[5].x = a_p_5;
628
+ kernel.packet[6].x = a_p_6;
629
+ kernel.packet[7].x = a_p_7;
630
+ kernel.packet[8].x = a_p_8;
631
+ kernel.packet[9].x = a_p_9;
632
+ kernel.packet[10].x = a_p_a;
633
+ kernel.packet[11].x = a_p_b;
634
+ kernel.packet[12].x = a_p_c;
635
+ kernel.packet[13].x = a_p_d;
636
+ kernel.packet[14].x = a_p_e;
637
+ kernel.packet[15].x = a_p_f;
638
+ }
639
+
640
+ EIGEN_STRONG_INLINE void
641
+ ptranspose(PacketBlock<Packet16h,8>& kernel) {
642
+ EIGEN_ALIGN64 half in[8][16];
643
+ pstore<half>(in[0], kernel.packet[0]);
644
+ pstore<half>(in[1], kernel.packet[1]);
645
+ pstore<half>(in[2], kernel.packet[2]);
646
+ pstore<half>(in[3], kernel.packet[3]);
647
+ pstore<half>(in[4], kernel.packet[4]);
648
+ pstore<half>(in[5], kernel.packet[5]);
649
+ pstore<half>(in[6], kernel.packet[6]);
650
+ pstore<half>(in[7], kernel.packet[7]);
651
+
652
+ EIGEN_ALIGN64 half out[8][16];
653
+
654
+ for (int i = 0; i < 8; ++i) {
655
+ for (int j = 0; j < 8; ++j) {
656
+ out[i][j] = in[j][2*i];
657
+ }
658
+ for (int j = 0; j < 8; ++j) {
659
+ out[i][j+8] = in[j][2*i+1];
660
+ }
661
+ }
662
+
663
+ kernel.packet[0] = pload<Packet16h>(out[0]);
664
+ kernel.packet[1] = pload<Packet16h>(out[1]);
665
+ kernel.packet[2] = pload<Packet16h>(out[2]);
666
+ kernel.packet[3] = pload<Packet16h>(out[3]);
667
+ kernel.packet[4] = pload<Packet16h>(out[4]);
668
+ kernel.packet[5] = pload<Packet16h>(out[5]);
669
+ kernel.packet[6] = pload<Packet16h>(out[6]);
670
+ kernel.packet[7] = pload<Packet16h>(out[7]);
671
+ }
672
+
673
+ EIGEN_STRONG_INLINE void
674
+ ptranspose(PacketBlock<Packet16h,4>& kernel) {
675
+ EIGEN_ALIGN64 half in[4][16];
676
+ pstore<half>(in[0], kernel.packet[0]);
677
+ pstore<half>(in[1], kernel.packet[1]);
678
+ pstore<half>(in[2], kernel.packet[2]);
679
+ pstore<half>(in[3], kernel.packet[3]);
680
+
681
+ EIGEN_ALIGN64 half out[4][16];
682
+
683
+ for (int i = 0; i < 4; ++i) {
684
+ for (int j = 0; j < 4; ++j) {
685
+ out[i][j] = in[j][4*i];
686
+ }
687
+ for (int j = 0; j < 4; ++j) {
688
+ out[i][j+4] = in[j][4*i+1];
689
+ }
690
+ for (int j = 0; j < 4; ++j) {
691
+ out[i][j+8] = in[j][4*i+2];
692
+ }
693
+ for (int j = 0; j < 4; ++j) {
694
+ out[i][j+12] = in[j][4*i+3];
695
+ }
696
+ }
697
+
698
+ kernel.packet[0] = pload<Packet16h>(out[0]);
699
+ kernel.packet[1] = pload<Packet16h>(out[1]);
700
+ kernel.packet[2] = pload<Packet16h>(out[2]);
701
+ kernel.packet[3] = pload<Packet16h>(out[3]);
702
+ }
703
+
704
+
705
+ #elif defined EIGEN_VECTORIZE_AVX
706
+
707
+ typedef struct {
708
+ __m128i x;
709
+ } Packet8h;
710
+
711
+
712
+ template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
713
+
714
+ template <>
715
+ struct packet_traits<Eigen::half> : default_packet_traits {
716
+ typedef Packet8h type;
717
+ // There is no half-size packet for Packet8h.
718
+ typedef Packet8h half;
719
+ enum {
720
+ Vectorizable = 1,
721
+ AlignedOnScalar = 1,
722
+ size = 8,
723
+ HasHalfPacket = 0,
724
+ HasAdd = 0,
725
+ HasSub = 0,
726
+ HasMul = 0,
727
+ HasNegate = 0,
728
+ HasAbs = 0,
729
+ HasAbs2 = 0,
730
+ HasMin = 0,
731
+ HasMax = 0,
732
+ HasConj = 0,
733
+ HasSetLinear = 0,
734
+ HasDiv = 0,
735
+ HasSqrt = 0,
736
+ HasRsqrt = 0,
737
+ HasExp = 0,
738
+ HasLog = 0,
739
+ HasBlend = 0
740
+ };
741
+ };
742
+
743
+
744
+ template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
745
+
746
+ template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
747
+ Packet8h result;
748
+ result.x = _mm_set1_epi16(from.x);
749
+ return result;
750
+ }
751
+
752
+ template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
753
+ return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
754
+ }
755
+
756
+ template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
757
+ Packet8h result;
758
+ result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
759
+ return result;
760
+ }
761
+
762
+ template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
763
+ Packet8h result;
764
+ result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
765
+ return result;
766
+ }
767
+
768
+ template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
769
+ _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
770
+ }
771
+
772
+ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
773
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
774
+ }
775
+
776
+ template<> EIGEN_STRONG_INLINE Packet8h
777
+ ploadquad<Packet8h>(const Eigen::half* from) {
778
+ Packet8h result;
779
+ unsigned short a = from[0].x;
780
+ unsigned short b = from[1].x;
781
+ result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
782
+ return result;
783
+ }
784
+
785
+ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
786
+ #ifdef EIGEN_HAS_FP16_C
787
+ return _mm256_cvtph_ps(a.x);
788
+ #else
789
+ EIGEN_ALIGN32 Eigen::half aux[8];
790
+ pstore(aux, a);
791
+ float f0(aux[0]);
792
+ float f1(aux[1]);
793
+ float f2(aux[2]);
794
+ float f3(aux[3]);
795
+ float f4(aux[4]);
796
+ float f5(aux[5]);
797
+ float f6(aux[6]);
798
+ float f7(aux[7]);
799
+
800
+ return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
801
+ #endif
802
+ }
803
+
804
+ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
805
+ #ifdef EIGEN_HAS_FP16_C
806
+ Packet8h result;
807
+ result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
808
+ return result;
809
+ #else
810
+ EIGEN_ALIGN32 float aux[8];
811
+ pstore(aux, a);
812
+ Eigen::half h0(aux[0]);
813
+ Eigen::half h1(aux[1]);
814
+ Eigen::half h2(aux[2]);
815
+ Eigen::half h3(aux[3]);
816
+ Eigen::half h4(aux[4]);
817
+ Eigen::half h5(aux[5]);
818
+ Eigen::half h6(aux[6]);
819
+ Eigen::half h7(aux[7]);
820
+
821
+ Packet8h result;
822
+ result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
823
+ return result;
824
+ #endif
825
+ }
826
+
827
+ template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
828
+
829
+ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
830
+ Packet8f af = half2float(a);
831
+ Packet8f bf = half2float(b);
832
+ Packet8f rf = padd(af, bf);
833
+ return float2half(rf);
834
+ }
835
+
836
+ template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
837
+ Packet8f af = half2float(a);
838
+ Packet8f bf = half2float(b);
839
+ Packet8f rf = pmul(af, bf);
840
+ return float2half(rf);
841
+ }
842
+
843
+ template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
844
+ {
845
+ Packet8h result;
846
+ result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
847
+ return result;
848
+ }
849
+
850
+ template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
851
+ {
852
+ EIGEN_ALIGN32 Eigen::half aux[8];
853
+ pstore(aux, from);
854
+ to[stride*0].x = aux[0].x;
855
+ to[stride*1].x = aux[1].x;
856
+ to[stride*2].x = aux[2].x;
857
+ to[stride*3].x = aux[3].x;
858
+ to[stride*4].x = aux[4].x;
859
+ to[stride*5].x = aux[5].x;
860
+ to[stride*6].x = aux[6].x;
861
+ to[stride*7].x = aux[7].x;
862
+ }
863
+
864
+ template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
865
+ Packet8f af = half2float(a);
866
+ float reduced = predux<Packet8f>(af);
867
+ return Eigen::half(reduced);
868
+ }
869
+
870
+ template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
871
+ Packet8f af = half2float(a);
872
+ float reduced = predux_max<Packet8f>(af);
873
+ return Eigen::half(reduced);
874
+ }
875
+
876
+ template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
877
+ Packet8f af = half2float(a);
878
+ float reduced = predux_min<Packet8f>(af);
879
+ return Eigen::half(reduced);
880
+ }
881
+
882
+ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
883
+ Packet8f af = half2float(a);
884
+ float reduced = predux_mul<Packet8f>(af);
885
+ return Eigen::half(reduced);
886
+ }
887
+
888
+ EIGEN_STRONG_INLINE void
889
+ ptranspose(PacketBlock<Packet8h,8>& kernel) {
890
+ __m128i a = kernel.packet[0].x;
891
+ __m128i b = kernel.packet[1].x;
892
+ __m128i c = kernel.packet[2].x;
893
+ __m128i d = kernel.packet[3].x;
894
+ __m128i e = kernel.packet[4].x;
895
+ __m128i f = kernel.packet[5].x;
896
+ __m128i g = kernel.packet[6].x;
897
+ __m128i h = kernel.packet[7].x;
898
+
899
+ __m128i a03b03 = _mm_unpacklo_epi16(a, b);
900
+ __m128i c03d03 = _mm_unpacklo_epi16(c, d);
901
+ __m128i e03f03 = _mm_unpacklo_epi16(e, f);
902
+ __m128i g03h03 = _mm_unpacklo_epi16(g, h);
903
+ __m128i a47b47 = _mm_unpackhi_epi16(a, b);
904
+ __m128i c47d47 = _mm_unpackhi_epi16(c, d);
905
+ __m128i e47f47 = _mm_unpackhi_epi16(e, f);
906
+ __m128i g47h47 = _mm_unpackhi_epi16(g, h);
907
+
908
+ __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
909
+ __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
910
+ __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
911
+ __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
912
+ __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
913
+ __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
914
+ __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
915
+ __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
916
+
917
+ __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
918
+ __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
919
+ __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
920
+ __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
921
+ __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
922
+ __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
923
+ __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
924
+ __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
925
+
926
+ kernel.packet[0].x = a0b0c0d0e0f0g0h0;
927
+ kernel.packet[1].x = a1b1c1d1e1f1g1h1;
928
+ kernel.packet[2].x = a2b2c2d2e2f2g2h2;
929
+ kernel.packet[3].x = a3b3c3d3e3f3g3h3;
930
+ kernel.packet[4].x = a4b4c4d4e4f4g4h4;
931
+ kernel.packet[5].x = a5b5c5d5e5f5g5h5;
932
+ kernel.packet[6].x = a6b6c6d6e6f6g6h6;
933
+ kernel.packet[7].x = a7b7c7d7e7f7g7h7;
934
+ }
935
+
936
+ EIGEN_STRONG_INLINE void
937
+ ptranspose(PacketBlock<Packet8h,4>& kernel) {
938
+ EIGEN_ALIGN32 Eigen::half in[4][8];
939
+ pstore<Eigen::half>(in[0], kernel.packet[0]);
940
+ pstore<Eigen::half>(in[1], kernel.packet[1]);
941
+ pstore<Eigen::half>(in[2], kernel.packet[2]);
942
+ pstore<Eigen::half>(in[3], kernel.packet[3]);
943
+
944
+ EIGEN_ALIGN32 Eigen::half out[4][8];
945
+
946
+ for (int i = 0; i < 4; ++i) {
947
+ for (int j = 0; j < 4; ++j) {
948
+ out[i][j] = in[j][2*i];
949
+ }
950
+ for (int j = 0; j < 4; ++j) {
951
+ out[i][j+4] = in[j][2*i+1];
952
+ }
953
+ }
954
+
955
+ kernel.packet[0] = pload<Packet8h>(out[0]);
956
+ kernel.packet[1] = pload<Packet8h>(out[1]);
957
+ kernel.packet[2] = pload<Packet8h>(out[2]);
958
+ kernel.packet[3] = pload<Packet8h>(out[3]);
959
+ }
960
+
961
+
962
+ // Disable the following code since it's broken on too many platforms / compilers.
963
+ //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
964
+ #elif 0
965
+
966
+ typedef struct {
967
+ __m64 x;
968
+ } Packet4h;
969
+
970
+
971
+ template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
972
+
973
+ template <>
974
+ struct packet_traits<Eigen::half> : default_packet_traits {
975
+ typedef Packet4h type;
976
+ // There is no half-size packet for Packet4h.
977
+ typedef Packet4h half;
978
+ enum {
979
+ Vectorizable = 1,
980
+ AlignedOnScalar = 1,
981
+ size = 4,
982
+ HasHalfPacket = 0,
983
+ HasAdd = 0,
984
+ HasSub = 0,
985
+ HasMul = 0,
986
+ HasNegate = 0,
987
+ HasAbs = 0,
988
+ HasAbs2 = 0,
989
+ HasMin = 0,
990
+ HasMax = 0,
991
+ HasConj = 0,
992
+ HasSetLinear = 0,
993
+ HasDiv = 0,
994
+ HasSqrt = 0,
995
+ HasRsqrt = 0,
996
+ HasExp = 0,
997
+ HasLog = 0,
998
+ HasBlend = 0
999
+ };
1000
+ };
1001
+
1002
+
1003
+ template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
1004
+
1005
+ template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
1006
+ Packet4h result;
1007
+ result.x = _mm_set1_pi16(from.x);
1008
+ return result;
1009
+ }
1010
+
1011
+ template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
1012
+ return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
1013
+ }
1014
+
1015
+ template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
1016
+
1017
+ template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
1018
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
1019
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
1020
+
1021
+ Eigen::half h[4];
1022
+
1023
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1024
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1025
+ h[0] = ha + hb;
1026
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1027
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1028
+ h[1] = ha + hb;
1029
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1030
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1031
+ h[2] = ha + hb;
1032
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1033
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1034
+ h[3] = ha + hb;
1035
+ Packet4h result;
1036
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1037
+ return result;
1038
+ }
1039
+
1040
+ template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
1041
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
1042
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
1043
+
1044
+ Eigen::half h[4];
1045
+
1046
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1047
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1048
+ h[0] = ha * hb;
1049
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1050
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1051
+ h[1] = ha * hb;
1052
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1053
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1054
+ h[2] = ha * hb;
1055
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1056
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1057
+ h[3] = ha * hb;
1058
+ Packet4h result;
1059
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1060
+ return result;
1061
+ }
1062
+
1063
+ template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
1064
+ Packet4h result;
1065
+ result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1066
+ return result;
1067
+ }
1068
+
1069
+ template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
1070
+ Packet4h result;
1071
+ result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1072
+ return result;
1073
+ }
1074
+
1075
+ template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1076
+ __int64_t r = _mm_cvtm64_si64(from.x);
1077
+ *(reinterpret_cast<__int64_t*>(to)) = r;
1078
+ }
1079
+
1080
+ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1081
+ __int64_t r = _mm_cvtm64_si64(from.x);
1082
+ *(reinterpret_cast<__int64_t*>(to)) = r;
1083
+ }
1084
+
1085
+ template<> EIGEN_STRONG_INLINE Packet4h
1086
+ ploadquad<Packet4h>(const Eigen::half* from) {
1087
+ return pset1<Packet4h>(*from);
1088
+ }
1089
+
1090
+ template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
1091
+ {
1092
+ Packet4h result;
1093
+ result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1094
+ return result;
1095
+ }
1096
+
1097
+ template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
1098
+ {
1099
+ __int64_t a = _mm_cvtm64_si64(from.x);
1100
+ to[stride*0].x = static_cast<unsigned short>(a);
1101
+ to[stride*1].x = static_cast<unsigned short>(a >> 16);
1102
+ to[stride*2].x = static_cast<unsigned short>(a >> 32);
1103
+ to[stride*3].x = static_cast<unsigned short>(a >> 48);
1104
+ }
1105
+
1106
+ EIGEN_STRONG_INLINE void
1107
+ ptranspose(PacketBlock<Packet4h,4>& kernel) {
1108
+ __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
1109
+ __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
1110
+ __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
1111
+ __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
1112
+
1113
+ kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
1114
+ kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
1115
+ kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
1116
+ kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
1117
+ }
1118
+
1119
+ #endif
1120
+
1121
+ }
1122
+ }
1123
+
1124
+ #endif // EIGEN_PACKET_MATH_HALF_CUDA_H