tomoto 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +123 -0
- data/ext/tomoto/ext.cpp +245 -0
- data/ext/tomoto/extconf.rb +28 -0
- data/lib/tomoto.rb +12 -0
- data/lib/tomoto/ct.rb +11 -0
- data/lib/tomoto/hdp.rb +11 -0
- data/lib/tomoto/lda.rb +67 -0
- data/lib/tomoto/version.rb +3 -0
- data/vendor/EigenRand/EigenRand/Core.h +1139 -0
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
- data/vendor/EigenRand/EigenRand/EigenRand +19 -0
- data/vendor/EigenRand/EigenRand/Macro.h +24 -0
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
- data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
- data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
- data/vendor/EigenRand/EigenRand/doc.h +220 -0
- data/vendor/EigenRand/LICENSE +21 -0
- data/vendor/EigenRand/README.md +288 -0
- data/vendor/eigen/COPYING.BSD +26 -0
- data/vendor/eigen/COPYING.GPL +674 -0
- data/vendor/eigen/COPYING.LGPL +502 -0
- data/vendor/eigen/COPYING.MINPACK +52 -0
- data/vendor/eigen/COPYING.MPL2 +373 -0
- data/vendor/eigen/COPYING.README +18 -0
- data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
- data/vendor/eigen/Eigen/Cholesky +46 -0
- data/vendor/eigen/Eigen/CholmodSupport +48 -0
- data/vendor/eigen/Eigen/Core +537 -0
- data/vendor/eigen/Eigen/Dense +7 -0
- data/vendor/eigen/Eigen/Eigen +2 -0
- data/vendor/eigen/Eigen/Eigenvalues +61 -0
- data/vendor/eigen/Eigen/Geometry +62 -0
- data/vendor/eigen/Eigen/Householder +30 -0
- data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
- data/vendor/eigen/Eigen/Jacobi +33 -0
- data/vendor/eigen/Eigen/LU +50 -0
- data/vendor/eigen/Eigen/MetisSupport +35 -0
- data/vendor/eigen/Eigen/OrderingMethods +73 -0
- data/vendor/eigen/Eigen/PaStiXSupport +48 -0
- data/vendor/eigen/Eigen/PardisoSupport +35 -0
- data/vendor/eigen/Eigen/QR +51 -0
- data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
- data/vendor/eigen/Eigen/SPQRSupport +34 -0
- data/vendor/eigen/Eigen/SVD +51 -0
- data/vendor/eigen/Eigen/Sparse +36 -0
- data/vendor/eigen/Eigen/SparseCholesky +45 -0
- data/vendor/eigen/Eigen/SparseCore +69 -0
- data/vendor/eigen/Eigen/SparseLU +46 -0
- data/vendor/eigen/Eigen/SparseQR +37 -0
- data/vendor/eigen/Eigen/StdDeque +27 -0
- data/vendor/eigen/Eigen/StdList +26 -0
- data/vendor/eigen/Eigen/StdVector +27 -0
- data/vendor/eigen/Eigen/SuperLUSupport +64 -0
- data/vendor/eigen/Eigen/UmfPackSupport +40 -0
- data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
- data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
- data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
- data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
- data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
- data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
- data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
- data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
- data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
- data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
- data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
- data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
- data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
- data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
- data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
- data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
- data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
- data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
- data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
- data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
- data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
- data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
- data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
- data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
- data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
- data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
- data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
- data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
- data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
- data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
- data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
- data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
- data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
- data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
- data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
- data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
- data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
- data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
- data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
- data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
- data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
- data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
- data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
- data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
- data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
- data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
- data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
- data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
- data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
- data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
- data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
- data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
- data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
- data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
- data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
- data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
- data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
- data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
- data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
- data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
- data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
- data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
- data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
- data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
- data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
- data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
- data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
- data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
- data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
- data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
- data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
- data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
- data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
- data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
- data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
- data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
- data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
- data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
- data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
- data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
- data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
- data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
- data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
- data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
- data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
- data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
- data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
- data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
- data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
- data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
- data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
- data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
- data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
- data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
- data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
- data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
- data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
- data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
- data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
- data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
- data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
- data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
- data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
- data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
- data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
- data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
- data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
- data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
- data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
- data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
- data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
- data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
- data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
- data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
- data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
- data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
- data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
- data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
- data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
- data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
- data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
- data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
- data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
- data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
- data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
- data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
- data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
- data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
- data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
- data/vendor/eigen/README.md +3 -0
- data/vendor/eigen/bench/README.txt +55 -0
- data/vendor/eigen/bench/btl/COPYING +340 -0
- data/vendor/eigen/bench/btl/README +154 -0
- data/vendor/eigen/bench/tensors/README +21 -0
- data/vendor/eigen/blas/README.txt +6 -0
- data/vendor/eigen/demos/mandelbrot/README +10 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
- data/vendor/eigen/demos/opengl/README +13 -0
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
- data/vendor/eigen/unsupported/README.txt +50 -0
- data/vendor/tomotopy/LICENSE +21 -0
- data/vendor/tomotopy/README.kr.rst +375 -0
- data/vendor/tomotopy/README.rst +382 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
- data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
- data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
- data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
- data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
- data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
- data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
- data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
- data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
- data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
- data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
- data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
- data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
- data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
- data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
- data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
- data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
- data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
- data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
- data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
- data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
- data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
- data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
- data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
- data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
- data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
- data/vendor/tomotopy/src/Utils/exception.h +28 -0
- data/vendor/tomotopy/src/Utils/math.h +281 -0
- data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
- data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
- data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
- data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
- data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
- data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
- data/vendor/tomotopy/src/Utils/text.hpp +49 -0
- data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
- metadata +531 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
This directory contains contributions from various users.
|
2
|
+
They are provided "as is", without any support. Nevertheless,
|
3
|
+
most of them are subject to be included in Eigen in the future.
|
4
|
+
|
5
|
+
In order to use an unsupported module you have to do either:
|
6
|
+
|
7
|
+
- add the path_to_eigen/unsupported directory to your include path and do:
|
8
|
+
#include <Eigen/ModuleHeader>
|
9
|
+
|
10
|
+
- or directly do:
|
11
|
+
#include <unsupported/Eigen/ModuleHeader>
|
12
|
+
|
13
|
+
|
14
|
+
If you are interested in contributing to one of them, or have other stuff
|
15
|
+
you would like to share, feel free to contact us:
|
16
|
+
http://eigen.tuxfamily.org/index.php?title=Main_Page#Mailing_list
|
17
|
+
|
18
|
+
Any kind of contributions are much appreciated, even very preliminary ones.
|
19
|
+
However, it:
|
20
|
+
- must rely on Eigen,
|
21
|
+
- must be highly related to math,
|
22
|
+
- should have some general purpose in the sense that it could
|
23
|
+
potentially become an official Eigen module (or be merged into another one).
|
24
|
+
|
25
|
+
If in doubt, feel free to contact us. For instance, if your addon is too specific
|
26
|
+
but it shows an interesting way of using Eigen, then it could be a nice demo.
|
27
|
+
|
28
|
+
|
29
|
+
This directory is organized as follows:
|
30
|
+
|
31
|
+
unsupported/Eigen/ModuleHeader1
|
32
|
+
unsupported/Eigen/ModuleHeader2
|
33
|
+
unsupported/Eigen/...
|
34
|
+
unsupported/Eigen/src/Module1/SourceFile1.h
|
35
|
+
unsupported/Eigen/src/Module1/SourceFile2.h
|
36
|
+
unsupported/Eigen/src/Module1/...
|
37
|
+
unsupported/Eigen/src/Module2/SourceFile1.h
|
38
|
+
unsupported/Eigen/src/Module2/SourceFile2.h
|
39
|
+
unsupported/Eigen/src/Module2/...
|
40
|
+
unsupported/Eigen/src/...
|
41
|
+
unsupported/doc/snippets/.cpp <- code snippets for the doc
|
42
|
+
unsupported/doc/examples/.cpp <- examples for the doc
|
43
|
+
unsupported/doc/TutorialModule1.dox
|
44
|
+
unsupported/doc/TutorialModule2.dox
|
45
|
+
unsupported/doc/...
|
46
|
+
unsupported/test/.cpp <- unit test files
|
47
|
+
|
48
|
+
The documentation is generated at the same time as the main Eigen documentation.
|
49
|
+
The .html files are generated in: build_dir/doc/html/unsupported/
|
50
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2019
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,375 @@
|
|
1
|
+
tomotopy
|
2
|
+
========
|
3
|
+
|
4
|
+
.. image:: https://badge.fury.io/py/tomotopy.svg
|
5
|
+
:target: https://pypi.python.org/pypi/tomotopy
|
6
|
+
|
7
|
+
.. image:: https://zenodo.org/badge/186155463.svg
|
8
|
+
:target: https://zenodo.org/badge/latestdoi/186155463
|
9
|
+
|
10
|
+
🎌
|
11
|
+
`English`_,
|
12
|
+
**한국어**.
|
13
|
+
|
14
|
+
.. _English: README.rst
|
15
|
+
|
16
|
+
tomotopy 란?
|
17
|
+
------------------
|
18
|
+
`tomotopy`는 토픽 모델링 툴인 `tomoto`의 Python 확장 버전입니다. `tomoto`는 c++로 작성된 깁스 샘플링 기반의 토픽 모델링 라이브러리로,
|
19
|
+
최신 CPU의 벡터화 기술을 활용하여 처리 속도를 최대로 끌어올렸습니다.
|
20
|
+
현재 버전의 `tomoto`에서는 다음과 같은 주요 토픽 모델들을 지원하고 있습니다.
|
21
|
+
|
22
|
+
* Latent Dirichlet Allocation (`tomotopy.LDAModel`)
|
23
|
+
* Labeled LDA (`tomotopy.LLDAModel`)
|
24
|
+
* Partially Labeled LDA (`tomotopy.PLDAModel`)
|
25
|
+
* Supervised LDA (`tomotopy.SLDAModel`)
|
26
|
+
* Dirichlet Multinomial Regression (`tomotopy.DMRModel`)
|
27
|
+
* Generalized Dirichlet Multinomial Regression (`tomotopy.GDMRModel`)
|
28
|
+
* Hierarchical Dirichlet Process (`tomotopy.HDPModel`)
|
29
|
+
* Hierarchical LDA (`tomotopy.HLDAModel`)
|
30
|
+
* Multi Grain LDA (`tomotopy.MGLDAModel`)
|
31
|
+
* Pachinko Allocation (`tomotopy.PAModel`)
|
32
|
+
* Hierarchical PA (`tomotopy.HPAModel`)
|
33
|
+
* Correlated Topic Model (`tomotopy.CTModel`)
|
34
|
+
* Dynamic Topic Model (`tomotopy.DTModel`)
|
35
|
+
|
36
|
+
더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
|
37
|
+
|
38
|
+
tomotopy의 가장 최신버전은 0.9.1 입니다.
|
39
|
+
|
40
|
+
시작하기
|
41
|
+
---------------
|
42
|
+
다음과 같이 pip를 이용하면 tomotopy를 쉽게 설치할 수 있습니다.
|
43
|
+
::
|
44
|
+
|
45
|
+
$ pip install --upgrade pip
|
46
|
+
$ pip install tomotopy
|
47
|
+
|
48
|
+
지원하는 운영체제 및 Python 버전은 다음과 같습니다:
|
49
|
+
|
50
|
+
* Python 3.5 이상이 설치된 Linux (x86-64)
|
51
|
+
* Python 3.5 이상이 설치된 macOS 10.13나 그 이후 버전
|
52
|
+
* Python 3.5 이상이 설치된 Windows 7이나 그 이후 버전 (x86, x86-64)
|
53
|
+
* Python 3.5 이상이 설치된 다른 운영체제: 이 경우는 c++11 호환 컴파일러를 통한 소스코드 컴파일이 필요합니다.
|
54
|
+
|
55
|
+
설치가 끝난 뒤에는 다음과 같이 Python3에서 바로 import하여 tomotopy를 사용할 수 있습니다.
|
56
|
+
::
|
57
|
+
|
58
|
+
import tomotopy as tp
|
59
|
+
print(tp.isa) # 'avx2'나 'avx', 'sse2', 'none'를 출력합니다.
|
60
|
+
|
61
|
+
현재 tomotopy는 가속을 위해 AVX2, AVX or SSE2 SIMD 명령어 세트를 활용할 수 있습니다.
|
62
|
+
패키지가 import될 때 현재 환경에서 활용할 수 있는 최선의 명령어 세트를 확인하여 최상의 모듈을 자동으로 가져옵니다.
|
63
|
+
만약 `tp.isa`가 `none`이라면 현재 환경에서 활용 가능한 SIMD 명령어 세트가 없는 것이므로 훈련에 오랜 시간이 걸릴 수 있습니다.
|
64
|
+
그러나 최근 대부분의 Intel 및 AMD CPU에서는 SIMD 명령어 세트를 지원하므로 SIMD 가속이 성능을 크게 향상시킬 수 있을 것입니다.
|
65
|
+
|
66
|
+
간단한 예제로 'sample.txt' 파일로 LDA 모델을 학습하는 코드는 다음과 같습니다.
|
67
|
+
::
|
68
|
+
|
69
|
+
import tomotopy as tp
|
70
|
+
mdl = tp.LDAModel(k=20)
|
71
|
+
for line in open('sample.txt'):
|
72
|
+
mdl.add_doc(line.strip().split())
|
73
|
+
|
74
|
+
for i in range(0, 100, 10):
|
75
|
+
mdl.train(10)
|
76
|
+
print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
|
77
|
+
|
78
|
+
for k in range(mdl.k):
|
79
|
+
print('Top 10 words of topic #{}'.format(k))
|
80
|
+
print(mdl.get_topic_words(k, top_n=10))
|
81
|
+
|
82
|
+
mdl.summary()
|
83
|
+
|
84
|
+
tomotopy의 성능
|
85
|
+
-----------------------
|
86
|
+
`tomotopy`는 주제 분포와 단어 분포를 추론하기 위해 Collapsed Gibbs-Sampling(CGS) 기법을 사용합니다.
|
87
|
+
일반적으로 CGS는 `gensim의 LdaModel`_가 이용하는 Variational Bayes(VB) 보다 느리게 수렴하지만 각각의 반복은 빠르게 계산 가능합니다.
|
88
|
+
게다가 `tomotopy`는 멀티스레드를 지원하므로 SIMD 명령어 세트뿐만 아니라 다중 코어 CPU의 장점까지 활용할 수 있습니다. 이 덕분에 각각의 반복이 훨씬 빠르게 계산 가능합니다.
|
89
|
+
|
90
|
+
.. _gensim의 LdaModel: https://radimrehurek.com/gensim/models/ldamodel.html
|
91
|
+
|
92
|
+
다음의 차트는 `tomotopy`와 `gensim`의 LDA 모형 실행 시간을 비교하여 보여줍니다.
|
93
|
+
입력 문헌은 영어 위키백과에서 가져온 1000개의 임의 문서이며 전체 문헌 집합은 총 1,506,966개의 단어로 구성되어 있습니다. (약 10.1 MB).
|
94
|
+
`tomotopy`는 200회를, `gensim` 10회를 반복 학습하였습니다.
|
95
|
+
|
96
|
+
.. image:: https://bab2min.github.io/tomotopy/images/tmt_i5.png
|
97
|
+
|
98
|
+
Intel i5-6600, x86-64 (4 cores)에서의 성능
|
99
|
+
|
100
|
+
.. image:: https://bab2min.github.io/tomotopy/images/tmt_xeon.png
|
101
|
+
|
102
|
+
Intel Xeon E5-2620 v4, x86-64 (8 cores, 16 threads)에서의 성능
|
103
|
+
|
104
|
+
`tomotopy`가 20배 더 많이 반복하였지만 전체 실행시간은 `gensim`보다 5~10배 더 빨랐습니다. 또한 `tomotopy`는 전반적으로 안정적인 결과를 보여주고 있습니다.
|
105
|
+
|
106
|
+
CGS와 VB는 서로 접근방법이 아예 다른 기법이기 때문에 둘을 직접적으로 비교하기는 어렵습니다만, 실용적인 관점에서 두 기법의 속도와 결과물을 비교해볼 수 있습니다.
|
107
|
+
다음의 차트에는 두 기법이 학습 후 보여준 단어당 로그 가능도 값이 표현되어 있습니다.
|
108
|
+
|
109
|
+
.. image:: https://bab2min.github.io/tomotopy/images/LLComp.png
|
110
|
+
|
111
|
+
어떤 SIMD 명령어 세트를 사용하는지는 성능에 큰 영향을 미칩니다.
|
112
|
+
다음 차트는 SIMD 명령어 세트에 따른 성능 차이를 보여줍니다.
|
113
|
+
|
114
|
+
.. image:: https://bab2min.github.io/tomotopy/images/SIMDComp.png
|
115
|
+
|
116
|
+
다행히도 최신 x86-64 CPU들은 대부분 AVX2 명령어 세트를 지원하기 때문에 대부분의 경우 AVX2의 높은 성능을 활용할 수 있을 것입니다.
|
117
|
+
|
118
|
+
모델의 저장과 불러오기
|
119
|
+
-------------------
|
120
|
+
`tomotopy`는 각각의 토픽 모델 클래스에 대해 `save`와 `load` 메소드를 제공합니다.
|
121
|
+
따라서 학습이 끝난 모델을 언제든지 파일에 저장하거나, 파일로부터 다시 읽어와서 다양한 작업을 수행할 수 있습니다.
|
122
|
+
::
|
123
|
+
|
124
|
+
import tomotopy as tp
|
125
|
+
|
126
|
+
mdl = tp.HDPModel()
|
127
|
+
for line in open('sample.txt'):
|
128
|
+
mdl.add_doc(line.strip().split())
|
129
|
+
|
130
|
+
for i in range(0, 100, 10):
|
131
|
+
mdl.train(10)
|
132
|
+
print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
|
133
|
+
|
134
|
+
# 파일에 저장
|
135
|
+
mdl.save('sample_hdp_model.bin')
|
136
|
+
|
137
|
+
# 파일로부터 불러오기
|
138
|
+
mdl = tp.HDPModel.load('sample_hdp_model.bin')
|
139
|
+
for k in range(mdl.k):
|
140
|
+
if not mdl.is_live_topic(k): continue
|
141
|
+
print('Top 10 words of topic #{}'.format(k))
|
142
|
+
print(mdl.get_topic_words(k, top_n=10))
|
143
|
+
|
144
|
+
# 저장된 모델이 HDP 모델이었기 때문에,
|
145
|
+
# LDA 모델에서 이 파일을 읽어오려고 하면 예외가 발생합니다.
|
146
|
+
mdl = tp.LDAModel.load('sample_hdp_model.bin')
|
147
|
+
|
148
|
+
파일로부터 모델을 불러올 때는 반드시 저장된 모델의 타입과 읽어올 모델의 타입이 일치해야합니다.
|
149
|
+
|
150
|
+
이에 대해서는 `tomotopy.LDAModel.save`와 `tomotopy.LDAModel.load`에서 더 자세한 내용을 확인할 수 있습니다.
|
151
|
+
|
152
|
+
모델 안의 문헌과 모델 밖의 문헌
|
153
|
+
-------------------------------------------
|
154
|
+
토픽 모델은 크게 2가지 목적으로 사용할 수 있습니다.
|
155
|
+
기본적으로는 문헌 집합으로부터 모델을 학습하여 문헌 내의 주제들을 발견하기 위해 토픽 모델을 사용할 수 있으며,
|
156
|
+
더 나아가 학습된 모델을 활용하여 학습할 때는 주어지지 않았던 새로운 문헌에 대해 주제 분포를 추론하는 것도 가능합니다.
|
157
|
+
전자의 과정에서 사용되는 문헌(학습 과정에서 사용되는 문헌)을 **모델 안의 문헌**,
|
158
|
+
후자의 과정에서 주어지는 새로운 문헌(학습 과정에 포함되지 않았던 문헌)을 **모델 밖의 문헌**이라고 가리키도록 하겠습니다.
|
159
|
+
|
160
|
+
`tomotopy`에서 이 두 종류의 문헌을 생성하는 방법은 다릅니다. **모델 안의 문헌**은 `tomotopy.LDAModel.add_doc`을 이용하여 생성합니다.
|
161
|
+
add_doc은 `tomotopy.LDAModel.train`을 시작하기 전까지만 사용할 수 있습니다.
|
162
|
+
즉 train을 시작한 이후로는 학습 문헌 집합이 고정되기 때문에 add_doc을 이용하여 새로운 문헌을 모델 내에 추가할 수 없습니다.
|
163
|
+
|
164
|
+
또한 생성된 문헌의 인스턴스를 얻기 위해서는 다음과 같이 `tomotopy.LDAModel.docs`를 사용해야 합니다.
|
165
|
+
|
166
|
+
::
|
167
|
+
|
168
|
+
mdl = tp.LDAModel(k=20)
|
169
|
+
idx = mdl.add_doc(words)
|
170
|
+
if idx < 0: raise RuntimeError("Failed to add doc")
|
171
|
+
doc_inst = mdl.docs[idx]
|
172
|
+
# doc_inst is an instance of the added document
|
173
|
+
|
174
|
+
**모델 밖의 문헌**은 `tomotopy.LDAModel.make_doc`을 이용해 생성합니다. make_doc은 add_doc과 반대로 train을 시작한 이후에 사용할 수 있습니다.
|
175
|
+
만약 train을 시작하기 전에 make_doc을 사용할 경우 올바르지 않은 결과를 얻게 되니 이 점 유의하시길 바랍니다. make_doc은 바로 인스턴스를 반환하므로 반환값을 받아 바로 사용할 수 있습니다.
|
176
|
+
|
177
|
+
::
|
178
|
+
|
179
|
+
mdl = tp.LDAModel(k=20)
|
180
|
+
# add_doc ...
|
181
|
+
mdl.train(100)
|
182
|
+
doc_inst = mdl.make_doc(unseen_doc) # doc_inst is an instance of the unseen document
|
183
|
+
|
184
|
+
새로운 문헌에 대해 추론하기
|
185
|
+
------------------------------
|
186
|
+
`tomotopy.LDAModel.make_doc`을 이용해 새로운 문헌을 생성했다면 이를 모델에 입력해 주제 분포를 추론하도록 할 수 있습니다.
|
187
|
+
새로운 문헌에 대한 추론은 `tomotopy.LDAModel.infer`를 사용합니다.
|
188
|
+
|
189
|
+
::
|
190
|
+
|
191
|
+
mdl = tp.LDAModel(k=20)
|
192
|
+
# add_doc ...
|
193
|
+
mdl.train(100)
|
194
|
+
doc_inst = mdl.make_doc(unseen_doc)
|
195
|
+
topic_dist, ll = mdl.infer(doc_inst)
|
196
|
+
print("Topic Distribution for Unseen Docs: ", topic_dist)
|
197
|
+
print("Log-likelihood of inference: ", ll)
|
198
|
+
|
199
|
+
infer 메소드는 `tomotopy.Document` 인스턴스 하나를 추론하거나 `tomotopy.Document` 인스턴스의 `list`를 추론하는데 사용할 수 있습니다.
|
200
|
+
자세한 것은 `tomotopy.LDAModel.infer`을 참조하길 바랍니다.
|
201
|
+
|
202
|
+
병렬 샘플링 알고리즘
|
203
|
+
----------------------------
|
204
|
+
`tomotopy`는 0.5.0버전부터 병렬 알고리즘을 고를 수 있는 선택지를 제공합니다.
|
205
|
+
0.4.2 이전버전까지 제공되던 알고리즘은 `COPY_MERGE`로 이 기법은 모든 토픽 모델에 사용 가능합니다.
|
206
|
+
새로운 알고리즘인 `PARTITION`은 0.5.0이후부터 사용가능하며, 이를 사용하면 더 빠르고 메모리 효율적으로 학습을 수행할 수 있습니다. 단 이 기법은 일부 토픽 모델에 대해서만 사용 가능합니다.
|
207
|
+
|
208
|
+
다음 차트는 토픽 개수와 코어 개수에 따라 두 기법의 속도 차이를 보여줍니다.
|
209
|
+
|
210
|
+
.. image:: https://bab2min.github.io/tomotopy/images/algo_comp.png
|
211
|
+
|
212
|
+
.. image:: https://bab2min.github.io/tomotopy/images/algo_comp2.png
|
213
|
+
|
214
|
+
버전별 속도 차이
|
215
|
+
----------------------
|
216
|
+
아래 그래프는 버전별 속도 차이를 표시한 것입니다.
|
217
|
+
LDA모델로 1000회 iteration을 수행시 걸리는 시간을 초 단위로 표시하였습니다.
|
218
|
+
(Docs: 11314, Vocab: 60382, Words: 2364724, Intel Xeon Gold 5120 @2.2GHz)
|
219
|
+
|
220
|
+
.. image:: https://bab2min.github.io/tomotopy/images/lda-perf-t1.png
|
221
|
+
|
222
|
+
.. image:: https://bab2min.github.io/tomotopy/images/lda-perf-t4.png
|
223
|
+
|
224
|
+
.. image:: https://bab2min.github.io/tomotopy/images/lda-perf-t8.png
|
225
|
+
|
226
|
+
어휘 사전분포를 이용하여 주제 고정하기
|
227
|
+
--------------------------------------
|
228
|
+
0.6.0 버전부터 `tomotopy.LDAModel.set_word_prior`라는 메소드가 추가되었습니다. 이 메소드로 특정 단어의 사전분포를 조절할 수 있습니다.
|
229
|
+
예를 들어 다음 코드처럼 단어 'church'의 가중치를 Topic 0에 대해서는 1.0, 나머지 Topic에 대해서는 0.1로 설정할 수 있습니다.
|
230
|
+
이는 단어 'church'가 Topic 0에 할당될 확률이 다른 Topic에 할당될 확률보다 10배 높다는 것을 의미하며, 따라서 대부분의 'church'는 Topic 0에 할당되게 됩니다.
|
231
|
+
그리고 학습을 거치며 'church'와 관련된 단어들 역시 Topic 0에 모이게 되므로, 최종적으로 Topic 0은 'church'와 관련된 주제가 될 것입니다.
|
232
|
+
이를 통해 특정 내용의 주제를 원하는 Topic 번호에 고정시킬 수 있습니다.
|
233
|
+
|
234
|
+
::
|
235
|
+
|
236
|
+
import tomotopy as tp
|
237
|
+
mdl = tp.LDAModel(k=20)
|
238
|
+
|
239
|
+
# add documents into `mdl`
|
240
|
+
|
241
|
+
# setting word prior
|
242
|
+
mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)])
|
243
|
+
|
244
|
+
자세한 내용은 `example.py`의 `word_prior_example` 함수를 참조하십시오.
|
245
|
+
|
246
|
+
예제 코드
|
247
|
+
---------
|
248
|
+
tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/examples/ 를 확인하시길 바랍니다.
|
249
|
+
|
250
|
+
예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다.
|
251
|
+
|
252
|
+
라이센스
|
253
|
+
---------
|
254
|
+
`tomotopy`는 MIT License 하에 배포됩니다.
|
255
|
+
|
256
|
+
역사
|
257
|
+
-------
|
258
|
+
* 0.9.1 (2020-08-08)
|
259
|
+
* 0.9.0 버전의 메모리 누수 문제가 해결되었습니다.
|
260
|
+
* `tomotopy.CTModel.summary()`가 잘못된 결과를 출력하는 문제가 해결되었습니다.
|
261
|
+
|
262
|
+
* 0.9.0 (2020-08-04)
|
263
|
+
* 모델의 상태를 알아보기 쉽게 출력해주는 `tomotopy.LDAModel.summary()` 메소드가 추가되었습니다.
|
264
|
+
* 난수 생성기를 `EigenRand`_로 대체하여 생성 속도를 높이고 플랫폼 간의 결과 차이를 해소하였습니다.
|
265
|
+
* 이로 인해 `seed`가 동일해도 모델 학습 결과가 0.9.0 이전 버전과 달라질 수 있습니다.
|
266
|
+
* `tomotopy.HDPModel`에서 간헐적으로 발생하는 학습 오류를 수정했습니다.
|
267
|
+
* 이제 `tomotopy.DMRModel.alpha`가 메타데이터별 토픽 분포의 사전 파라미터를 보여줍니다.
|
268
|
+
* `tomotopy.DTModel.get_count_by_topics()`가 2차원 `ndarray`를 반환하도록 수정되었습니다.
|
269
|
+
* `tomotopy.DTModel.alpha`가 `tomotopy.DTModel.get_alpha()`와 동일한 값을 반환하도록 수정되었습니다.
|
270
|
+
* `tomotopy.GDMRModel`의 document에 대해 `metadata` 값을 얻어올 수 없던 문제가 해결되었습니다.
|
271
|
+
* 이제 `tomotopy.HLDAModel.alpha`가 문헌별 계층 분포의 사전 파라미터를 보여줍니다.
|
272
|
+
* `tomotopy.LDAModel.global_step`이 추가되었습니다.
|
273
|
+
* 이제 `tomotopy.MGLDAModel.get_count_by_topics()`가 전역 토픽과 지역 토픽 모두의 단어 개수를 보여줍니다.
|
274
|
+
* `tomotopy.PAModel.alpha`, `tomotopy.PAModel.subalpha`, `tomotopy.PAModel.get_count_by_super_topic()`이 추가되었습니다.
|
275
|
+
|
276
|
+
.. _EigenRand: https://github.com/bab2min/EigenRand
|
277
|
+
|
278
|
+
* 0.8.2 (2020-07-14)
|
279
|
+
* `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다.
|
280
|
+
* `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
|
281
|
+
|
282
|
+
* 0.8.1 (2020-06-08)
|
283
|
+
* `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다.
|
284
|
+
* 이제 `tomotopy.CTModel.prior_cov`가 `[k, k]` 모양의 공분산 행렬을 반환합니다.
|
285
|
+
* 이제 인자 없이 `tomotopy.CTModel.get_correlations`를 호출하면 `[k, k]` 모양의 상관관계 행렬을 반환합니다.
|
286
|
+
|
287
|
+
* 0.8.0 (2020-06-06)
|
288
|
+
* NumPy가 tomotopy에 도입됨에 따라 많은 메소드와 프로퍼티들이 `list`가 아니라 `numpy.ndarray`를 반환하도록 변경되었습니다.
|
289
|
+
* Tomotopy에 새 의존관계 `NumPy >= 1.10.0`가 추가되었습니다.
|
290
|
+
* `tomotopy.HDPModel.infer`가 잘못된 추론을 하던 문제가 수정되었습니다.
|
291
|
+
* HDP 모델을 LDA 모델로 변환하는 메소드가 추가되었습니다.
|
292
|
+
* `tomotopy.LDAModel.used_vocabs`, `tomotopy.LDAModel.used_vocab_freq`, `tomotopy.LDAModel.used_vocab_df` 등의 새로운 프로퍼티가 모델에 추가되었습니다.
|
293
|
+
* 새로운 토픽 모델인 g-DMR(`tomotopy.GDMRModel`)가 추가되었습니다.
|
294
|
+
* macOS에서 `tomotopy.label.FoRelevance`를 생성할 때 발생하던 문제가 해결되었습니다.
|
295
|
+
* `tomotopy.utils.Corpus.add_doc`로 `raw`가 없는 문헌을 생성한 뒤 토픽 모델에 입력할 시 발생하는 오류를 수정했습니다.
|
296
|
+
|
297
|
+
* 0.7.1 (2020-05-08)
|
298
|
+
* `tomotopy.HLDAModel`용으로 `tomotopy.Document.path`가 새로 추가되었습니다.
|
299
|
+
* `tomotopy.label.PMIExtractor` 사용시에 발생하던 메모리 문제가 해결되었습니다.
|
300
|
+
* gcc 7에서 발생하던 컴파일 오류가 해결되었습니다.
|
301
|
+
|
302
|
+
* 0.7.0 (2020-04-18)
|
303
|
+
* `tomotopy.DTModel`이 추가되었습니다.
|
304
|
+
* `tomotopy.utils.Corpus.save`가 제대로 작동하지 않는 버그가 수정되었습니다.
|
305
|
+
* `tomotopy.LDAModel.get_count_vector`가 추가되었습니다.
|
306
|
+
* 리눅스용 바이너리가 manylinux2010 버전으로 변경되었고 이에 따른 최적화가 진행되었습니다.
|
307
|
+
|
308
|
+
* 0.6.2 (2020-03-28)
|
309
|
+
* `save`와 `load`에 관련된 치명적인 버그가 수정되었습니다. 해당 버그로 0.6.0 및 0.6.1 버전은 릴리즈에서 삭제되었습니다.
|
310
|
+
|
311
|
+
* 0.6.1 (2020-03-22) (삭제됨)
|
312
|
+
* 모듈 로딩과 관련된 버그가 수정되었습니다.
|
313
|
+
|
314
|
+
* 0.6.0 (2020-03-22) (삭제됨)
|
315
|
+
* 대량의 문헌을 관리하기 위한 `tomotopy.utils.Corpus`가 추가되었습니다.
|
316
|
+
* 어휘-주제 분포의 사전 확률을 조절할 수 있는 `tomotopy.LDAModel.set_word_prior` 메소드가 추가되었습니다.
|
317
|
+
* 문헌 빈도를 기반으로 어휘를 필터링할 수 있도록 토픽 모델의 생성자에 `min_df`가 추가되었습니다.
|
318
|
+
* 토픽 라벨링 관련 서브모듈인 `tomotopy.label`이 추가되었습니다. 현재는 `tomotopy.label.FoRelevance`만 제공됩니다.
|
319
|
+
|
320
|
+
* 0.5.2 (2020-03-01)
|
321
|
+
* `tomotopy.LLDAModel.add_doc` 실행시 segmentation fault가 발생하는 문제를 해결했습니다.
|
322
|
+
* `tomotopy.HDPModel`에서 `infer` 실행시 종종 프로그램이 종료되는 문제를 해결했습니다.
|
323
|
+
* `tomotopy.LDAModel.infer`에서 ps=tomotopy.ParallelScheme.PARTITION, together=True로 실행시 발생하는 오류를 해결했습니다.
|
324
|
+
|
325
|
+
* 0.5.1 (2020-01-11)
|
326
|
+
* `tomotopy.SLDAModel.make_doc`에서 결측값을 지원하지 않던 문제를 해결했습니다.
|
327
|
+
* `tomotopy.SLDAModel`이 이제 결측값을 지원합니다. 결측값을 가진 문헌은 토픽 모델링에는 참여하지만, 응답 변수 회귀에서는 제외됩니다.
|
328
|
+
|
329
|
+
* 0.5.0 (2019-12-30)
|
330
|
+
* `tomotopy.PAModel.infer`가 topic distribution과 sub-topic distribution을 동시에 반환합니다.
|
331
|
+
* `tomotopy.Document`에 get_sub_topics, get_sub_topic_dist 메소드가 추가되었습니다. (PAModel 전용)
|
332
|
+
* `tomotopy.LDAModel.train` 및 `tomotopy.LDAModel.infer` 메소드에 parallel 옵션이 추가되었습니다. 이를 통해 학습 및 추론시 사용할 병렬화 알고리즘을 선택할 수 있습니다.
|
333
|
+
* `tomotopy.ParallelScheme.PARTITION` 알고리즘이 추가되었습니다. 이 알고리즘은 작업자 수가 많거나 토픽의 개수나 어휘 크기가 클 때도 효율적으로 작동합니다.
|
334
|
+
* 모델 생성시 min_cf < 2일때 rm_top 옵션이 적용되지 않는 문제를 수정하였습니다.
|
335
|
+
|
336
|
+
* 0.4.2 (2019-11-30)
|
337
|
+
* `tomotopy.LLDAModel`와 `tomotopy.PLDAModel` 모델에서 토픽 할당이 잘못 일어나던 문제를 해결했습니다.
|
338
|
+
* `tomotopy.Document` 및 `tomotopy.Dictionary` 클래스에 가독성이 좋은 __repr__가 추가되었습니다.
|
339
|
+
|
340
|
+
* 0.4.1 (2019-11-27)
|
341
|
+
* `tomotopy.PLDAModel` 생성자의 버그를 수정했습니다.
|
342
|
+
|
343
|
+
* 0.4.0 (2019-11-18)
|
344
|
+
* `tomotopy.PLDAModel`와 `tomotopy.HLDAModel` 토픽 모델이 새로 추가되었습니다.
|
345
|
+
|
346
|
+
* 0.3.1 (2019-11-05)
|
347
|
+
* `min_cf` 혹은 `rm_top`가 설정되었을 때 `get_topic_dist()`의 반환값이 부정확한 문제를 수정하였습니다.
|
348
|
+
* `tomotopy.MGLDAModel` 모델의 문헌의 `get_topic_dist()`가 지역 토픽에 대한 분포도 함께 반환하도록 수정하였습니다.
|
349
|
+
* `tw=ONE`일때의 학습 속도가 개선되었습니다.
|
350
|
+
|
351
|
+
* 0.3.0 (2019-10-06)
|
352
|
+
* `tomotopy.LLDAModel` 토픽 모델이 새로 추가되었습니다.
|
353
|
+
* `HDPModel`을 학습할 때 프로그램이 종료되는 문제를 해결했습니다.
|
354
|
+
* `HDPModel`의 하이퍼파라미터 추정 기능이 추가되었습니다. 이 때문에 새 버전의 `HDPModel` 결과는 이전 버전과 다를 수 있습니다.
|
355
|
+
이전 버전처럼 하이퍼파라미터 추정을 끄려면, `optim_interval`을 0으로 설정하십시오.
|
356
|
+
|
357
|
+
* 0.2.0 (2019-08-18)
|
358
|
+
* `tomotopy.CTModel`와 `tomotopy.SLDAModel` 토픽 모델이 새로 추가되었습니다.
|
359
|
+
* `rm_top` 파라미터 옵션이 모든 토픽 모델에 추가되었습니다.
|
360
|
+
* `PAModel`과 `HPAModel` 모델에서 `save`와 `load`가 제대로 작동하지 않는 문제를 해결하였습니다.
|
361
|
+
* `HDPModel` 인스턴스를 파일로부터 로딩할 때 종종 프로그램이 종료되는 문제를 해결하였습니다.
|
362
|
+
* `min_cf` > 0으로 설정하였을 때 `ll_per_word` 값이 잘못 계산되는 문제를 해결하였습니다.
|
363
|
+
|
364
|
+
* 0.1.6 (2019-08-09)
|
365
|
+
* macOS와 clang에서 제대로 컴파일되지 않는 문제를 해결했습니다.
|
366
|
+
|
367
|
+
* 0.1.4 (2019-08-05)
|
368
|
+
* `add_doc` 메소드가 빈 리스트를 받았을 때 발생하는 문제를 해결하였습니다.
|
369
|
+
* `tomotopy.PAModel.get_topic_words`가 하위토픽의 단어 분포를 제대로 반환하지 못하는 문제를 해결하였습니다.
|
370
|
+
|
371
|
+
* 0.1.3 (2019-05-19)
|
372
|
+
* `min_cf` 파라미터와 불용어 제거 기능이 모든 토픽 모델에 추가되었습니다.
|
373
|
+
|
374
|
+
* 0.1.0 (2019-05-12)
|
375
|
+
* **tomotopy**의 최초 버전
|
@@ -0,0 +1,382 @@
|
|
1
|
+
tomotopy
|
2
|
+
========
|
3
|
+
|
4
|
+
.. image:: https://badge.fury.io/py/tomotopy.svg
|
5
|
+
:target: https://pypi.python.org/pypi/tomotopy
|
6
|
+
|
7
|
+
.. image:: https://zenodo.org/badge/186155463.svg
|
8
|
+
:target: https://zenodo.org/badge/latestdoi/186155463
|
9
|
+
|
10
|
+
🎌
|
11
|
+
**English**,
|
12
|
+
`한국어`_.
|
13
|
+
|
14
|
+
.. _한국어: README.kr.rst
|
15
|
+
|
16
|
+
What is tomotopy?
|
17
|
+
------------------
|
18
|
+
|
19
|
+
`tomotopy` is a Python extension of `tomoto` (Topic Modeling Tool) which is a Gibbs-sampling based topic model library written in C++.
|
20
|
+
It utilizes a vectorization of modern CPUs for maximizing speed.
|
21
|
+
The current version of `tomoto` supports several major topic models including
|
22
|
+
|
23
|
+
* Latent Dirichlet Allocation (`tomotopy.LDAModel`)
|
24
|
+
* Labeled LDA (`tomotopy.LLDAModel`)
|
25
|
+
* Partially Labeled LDA (`tomotopy.PLDAModel`)
|
26
|
+
* Supervised LDA (`tomotopy.SLDAModel`)
|
27
|
+
* Dirichlet Multinomial Regression (`tomotopy.DMRModel`)
|
28
|
+
* Generalized Dirichlet Multinomial Regression (`tomotopy.GDMRModel`)
|
29
|
+
* Hierarchical Dirichlet Process (`tomotopy.HDPModel`)
|
30
|
+
* Hierarchical LDA (`tomotopy.HLDAModel`)
|
31
|
+
* Multi Grain LDA (`tomotopy.MGLDAModel`)
|
32
|
+
* Pachinko Allocation (`tomotopy.PAModel`)
|
33
|
+
* Hierarchical PA (`tomotopy.HPAModel`)
|
34
|
+
* Correlated Topic Model (`tomotopy.CTModel`)
|
35
|
+
* Dynamic Topic Model (`tomotopy.DTModel`).
|
36
|
+
|
37
|
+
Please visit https://bab2min.github.io/tomotopy to see more information.
|
38
|
+
|
39
|
+
The most recent version of tomotopy is 0.9.1.
|
40
|
+
|
41
|
+
Getting Started
|
42
|
+
---------------
|
43
|
+
You can install tomotopy easily using pip. (https://pypi.org/project/tomotopy/)
|
44
|
+
::
|
45
|
+
|
46
|
+
$ pip install --upgrade pip
|
47
|
+
$ pip install tomotopy
|
48
|
+
|
49
|
+
The supported OS and Python versions are:
|
50
|
+
|
51
|
+
* Linux (x86-64) with Python >= 3.5
|
52
|
+
* macOS >= 10.13 with Python >= 3.5
|
53
|
+
* Windows 7 or later (x86, x86-64) with Python >= 3.5
|
54
|
+
* Other OS with Python >= 3.5: Compilation from source code required (with c++11 compatible compiler)
|
55
|
+
|
56
|
+
After installing, you can start tomotopy by just importing.
|
57
|
+
::
|
58
|
+
|
59
|
+
import tomotopy as tp
|
60
|
+
print(tp.isa) # prints 'avx2', 'avx', 'sse2' or 'none'
|
61
|
+
|
62
|
+
Currently, tomotopy can exploit AVX2, AVX or SSE2 SIMD instruction sets for maximizing performance.
|
63
|
+
When the package is imported, it will check available instruction sets and select the best option.
|
64
|
+
If `tp.isa` tells `none`, iterations of training may take a long time.
|
65
|
+
But, since most modern Intel or AMD CPUs provide SIMD instruction sets, the SIMD acceleration could show a big improvement.
|
66
|
+
|
67
|
+
Here is a sample code for simple LDA training of texts from 'sample.txt' file.
|
68
|
+
::
|
69
|
+
|
70
|
+
import tomotopy as tp
|
71
|
+
mdl = tp.LDAModel(k=20)
|
72
|
+
for line in open('sample.txt'):
|
73
|
+
mdl.add_doc(line.strip().split())
|
74
|
+
|
75
|
+
for i in range(0, 100, 10):
|
76
|
+
mdl.train(10)
|
77
|
+
print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
|
78
|
+
|
79
|
+
for k in range(mdl.k):
|
80
|
+
print('Top 10 words of topic #{}'.format(k))
|
81
|
+
print(mdl.get_topic_words(k, top_n=10))
|
82
|
+
|
83
|
+
mdl.summary()
|
84
|
+
|
85
|
+
Performance of tomotopy
|
86
|
+
-----------------------
|
87
|
+
`tomotopy` uses Collapsed Gibbs-Sampling(CGS) to infer the distribution of topics and the distribution of words.
|
88
|
+
Generally CGS converges more slowly than Variational Bayes(VB) that `gensim's LdaModel`_ uses, but its iteration can be computed much faster.
|
89
|
+
In addition, `tomotopy` can take advantage of multicore CPUs with a SIMD instruction set, which can result in faster iterations.
|
90
|
+
|
91
|
+
.. _gensim's LdaModel: https://radimrehurek.com/gensim/models/ldamodel.html
|
92
|
+
|
93
|
+
Following chart shows the comparison of LDA model's running time between `tomotopy` and `gensim`.
|
94
|
+
The input data consists of 1000 random documents from English Wikipedia with 1,506,966 words (about 10.1 MB).
|
95
|
+
`tomotopy` trains 200 iterations and `gensim` trains 10 iterations.
|
96
|
+
|
97
|
+
.. image:: https://bab2min.github.io/tomotopy/images/tmt_i5.png
|
98
|
+
|
99
|
+
Performance in Intel i5-6600, x86-64 (4 cores)
|
100
|
+
|
101
|
+
.. image:: https://bab2min.github.io/tomotopy/images/tmt_xeon.png
|
102
|
+
|
103
|
+
Performance in Intel Xeon E5-2620 v4, x86-64 (8 cores, 16 threads)
|
104
|
+
|
105
|
+
Although `tomotopy` iterated 20 times more, the overall running time was 5~10 times faster than `gensim`. And it yields a stable result.
|
106
|
+
|
107
|
+
It is difficult to compare CGS and VB directly because they are totally different techniques.
|
108
|
+
But from a practical point of view, we can compare the speed and the result between them.
|
109
|
+
The following chart shows the log-likelihood per word of two models' result.
|
110
|
+
|
111
|
+
.. image:: https://bab2min.github.io/tomotopy/images/LLComp.png
|
112
|
+
|
113
|
+
The SIMD instruction set has a great effect on performance. Following is a comparison between SIMD instruction sets.
|
114
|
+
|
115
|
+
.. image:: https://bab2min.github.io/tomotopy/images/SIMDComp.png
|
116
|
+
|
117
|
+
Fortunately, most recent x86-64 CPUs provide the AVX2 instruction set, so we can enjoy the performance of AVX2.
|
118
|
+
|
119
|
+
Model Save and Load
|
120
|
+
-------------------
|
121
|
+
`tomotopy` provides `save` and `load` method for each topic model class,
|
122
|
+
so you can save the model into the file whenever you want, and re-load it from the file.
|
123
|
+
::
|
124
|
+
|
125
|
+
import tomotopy as tp
|
126
|
+
|
127
|
+
mdl = tp.HDPModel()
|
128
|
+
for line in open('sample.txt'):
|
129
|
+
mdl.add_doc(line.strip().split())
|
130
|
+
|
131
|
+
for i in range(0, 100, 10):
|
132
|
+
mdl.train(10)
|
133
|
+
print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
|
134
|
+
|
135
|
+
# save into file
|
136
|
+
mdl.save('sample_hdp_model.bin')
|
137
|
+
|
138
|
+
# load from file
|
139
|
+
mdl = tp.HDPModel.load('sample_hdp_model.bin')
|
140
|
+
for k in range(mdl.k):
|
141
|
+
if not mdl.is_live_topic(k): continue
|
142
|
+
print('Top 10 words of topic #{}'.format(k))
|
143
|
+
print(mdl.get_topic_words(k, top_n=10))
|
144
|
+
|
145
|
+
# the saved model is HDP model,
|
146
|
+
# so when you load it by LDA model, it will raise an exception
|
147
|
+
mdl = tp.LDAModel.load('sample_hdp_model.bin')
|
148
|
+
|
149
|
+
When you load a model from a file, the model type stored in the file must match the class whose `load` method is called.
|
150
|
+
|
151
|
+
See more at `tomotopy.LDAModel.save` and `tomotopy.LDAModel.load` methods.
|
152
|
+
|
153
|
+
Documents in the Model and out of the Model
|
154
|
+
-------------------------------------------
|
155
|
+
We can use Topic Model for two major purposes.
|
156
|
+
The basic one is to discover topics from a set of documents as a result of trained model,
|
157
|
+
and the more advanced one is to infer topic distributions for unseen documents by using trained model.
|
158
|
+
|
159
|
+
We named the document in the former purpose (used for model training) as **document in the model**,
|
160
|
+
and the document in the later purpose (unseen document during training) as **document out of the model**.
|
161
|
+
|
162
|
+
In `tomotopy`, these two different kinds of document are generated differently.
|
163
|
+
A **document in the model** can be created by `tomotopy.LDAModel.add_doc` method.
|
164
|
+
`add_doc` can be called before `tomotopy.LDAModel.train` starts.
|
165
|
+
In other words, after `train` is called, `add_doc` cannot add a document into the model because the set of documents used for training has become fixed.
|
166
|
+
|
167
|
+
To acquire the instance of the created document, you should use `tomotopy.LDAModel.docs` like:
|
168
|
+
|
169
|
+
::
|
170
|
+
|
171
|
+
mdl = tp.LDAModel(k=20)
|
172
|
+
idx = mdl.add_doc(words)
|
173
|
+
if idx < 0: raise RuntimeError("Failed to add doc")
|
174
|
+
doc_inst = mdl.docs[idx]
|
175
|
+
# doc_inst is an instance of the added document
|
176
|
+
|
177
|
+
A **document out of the model** is generated by `tomotopy.LDAModel.make_doc` method. `make_doc` can be called only after `train` starts.
|
178
|
+
If you use `make_doc` before the set of documents used for training has become fixed, you may get wrong results.
|
179
|
+
Since `make_doc` returns the instance directly, you can use its return value for other manipulations.
|
180
|
+
|
181
|
+
::
|
182
|
+
|
183
|
+
mdl = tp.LDAModel(k=20)
|
184
|
+
# add_doc ...
|
185
|
+
mdl.train(100)
|
186
|
+
doc_inst = mdl.make_doc(unseen_doc) # doc_inst is an instance of the unseen document
|
187
|
+
|
188
|
+
Inference for Unseen Documents
|
189
|
+
------------------------------
|
190
|
+
If a new document is created by `tomotopy.LDAModel.make_doc`, its topic distribution can be inferred by the model.
|
191
|
+
Inference for unseen document should be performed using `tomotopy.LDAModel.infer` method.
|
192
|
+
|
193
|
+
::
|
194
|
+
|
195
|
+
mdl = tp.LDAModel(k=20)
|
196
|
+
# add_doc ...
|
197
|
+
mdl.train(100)
|
198
|
+
doc_inst = mdl.make_doc(unseen_doc)
|
199
|
+
topic_dist, ll = mdl.infer(doc_inst)
|
200
|
+
print("Topic Distribution for Unseen Docs: ", topic_dist)
|
201
|
+
print("Log-likelihood of inference: ", ll)
|
202
|
+
|
203
|
+
The `infer` method can infer only one instance of `tomotopy.Document` or a `list` of instances of `tomotopy.Document`.
|
204
|
+
See more at `tomotopy.LDAModel.infer`.
|
205
|
+
|
206
|
+
Parallel Sampling Algorithms
|
207
|
+
----------------------------
|
208
|
+
Since version 0.5.0, `tomotopy` allows you to choose a parallelism algorithm.
|
209
|
+
The algorithm provided in versions prior to 0.4.2 is `COPY_MERGE`, which is provided for all topic models.
|
210
|
+
The new algorithm `PARTITION`, available since 0.5.0, makes training generally faster and more memory-efficient, but it is not available for all topic models.
|
211
|
+
|
212
|
+
The following chart shows the speed difference between the two algorithms based on the number of topics and the number of workers.
|
213
|
+
|
214
|
+
.. image:: https://bab2min.github.io/tomotopy/images/algo_comp.png
|
215
|
+
|
216
|
+
.. image:: https://bab2min.github.io/tomotopy/images/algo_comp2.png
|
217
|
+
|
218
|
+
Performance by Version
|
219
|
+
----------------------
|
220
|
+
Performance changes by version are shown in the following graph.
|
221
|
+
The time it takes to train the LDA model with 1000 iterations was measured.
|
222
|
+
(Docs: 11314, Vocab: 60382, Words: 2364724, Intel Xeon Gold 5120 @2.2GHz)
|
223
|
+
|
224
|
+
.. image:: https://bab2min.github.io/tomotopy/images/lda-perf-t1.png
|
225
|
+
|
226
|
+
.. image:: https://bab2min.github.io/tomotopy/images/lda-perf-t4.png
|
227
|
+
|
228
|
+
.. image:: https://bab2min.github.io/tomotopy/images/lda-perf-t8.png
|
229
|
+
|
230
|
+
Pinning Topics using Word Priors
|
231
|
+
-------------------------------
|
232
|
+
Since version 0.6.0, a new method `tomotopy.LDAModel.set_word_prior` has been added. It allows you to control word prior for each topic.
|
233
|
+
For example, we can set the weight of the word 'church' to 1.0 in topic 0, and the weight to 0.1 in the rest of the topics by following codes.
|
234
|
+
This means that the probability that the word 'church' is assigned to topic 0 is 10 times higher than the probability of being assigned to another topic.
|
235
|
+
Therefore, most of 'church' is assigned to topic 0, so topic 0 contains many words related to 'church'.
|
236
|
+
This allows to manipulate some topics to be placed at a specific topic number.
|
237
|
+
|
238
|
+
::
|
239
|
+
|
240
|
+
import tomotopy as tp
|
241
|
+
mdl = tp.LDAModel(k=20)
|
242
|
+
|
243
|
+
# add documents into `mdl`
|
244
|
+
|
245
|
+
# setting word prior
|
246
|
+
mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)])
|
247
|
+
|
248
|
+
See `word_prior_example` in `example.py` for more details.
|
249
|
+
|
250
|
+
|
251
|
+
Examples
|
252
|
+
--------
|
253
|
+
You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/master/examples/ .
|
254
|
+
|
255
|
+
You can also get the data file used in the example code at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view .
|
256
|
+
|
257
|
+
License
|
258
|
+
---------
|
259
|
+
`tomotopy` is licensed under the terms of MIT License,
|
260
|
+
meaning you can use it for any reasonable purpose and remain in complete ownership of all the documentation you produce.
|
261
|
+
|
262
|
+
History
|
263
|
+
-------
|
264
|
+
* 0.9.1 (2020-08-08)
|
265
|
+
* Memory leaks of version 0.9.0 was fixed.
|
266
|
+
* `tomotopy.CTModel.summary()` was fixed.
|
267
|
+
|
268
|
+
* 0.9.0 (2020-08-04)
|
269
|
+
* The `tomotopy.LDAModel.summary()` method, which prints human-readable summary of the model, has been added.
|
270
|
+
* The random number generator of package has been replaced with `EigenRand`_. It speeds up the random number generation and solves the result difference between platforms.
|
271
|
+
* Due to above, even if `seed` is the same, the model training result may be different from the version before 0.9.0.
|
272
|
+
* Fixed a training error in `tomotopy.HDPModel`.
|
273
|
+
* `tomotopy.DMRModel.alpha` now shows Dirichlet prior of per-document topic distribution by metadata.
|
274
|
+
* `tomotopy.DTModel.get_count_by_topics()` has been modified to return a 2-dimensional `ndarray`.
|
275
|
+
* `tomotopy.DTModel.alpha` has been modified to return the same value as `tomotopy.DTModel.get_alpha()`.
|
276
|
+
* Fixed an issue where the `metadata` value could not be obtained for the document of `tomotopy.GDMRModel`.
|
277
|
+
* `tomotopy.HLDAModel.alpha` now shows Dirichlet prior of per-document depth distribution.
|
278
|
+
* `tomotopy.LDAModel.global_step` has been added.
|
279
|
+
* `tomotopy.MGLDAModel.get_count_by_topics()` now returns the word count for both global and local topics.
|
280
|
+
* `tomotopy.PAModel.alpha`, `tomotopy.PAModel.subalpha`, and `tomotopy.PAModel.get_count_by_super_topic()` have been added.
|
281
|
+
|
282
|
+
.. _EigenRand: https://github.com/bab2min/EigenRand
|
283
|
+
|
284
|
+
* 0.8.2 (2020-07-14)
|
285
|
+
* New properties `tomotopy.DTModel.num_timepoints` and `tomotopy.DTModel.num_docs_by_timepoint` have been added.
|
286
|
+
* A bug which causes different results with the different platform even if `seeds` were the same was partially fixed.
|
287
|
+
As a result of this fix, now `tomotopy` in 32 bit yields different training results from earlier version.
|
288
|
+
|
289
|
+
* 0.8.1 (2020-06-08)
|
290
|
+
* A bug where `tomotopy.LDAModel.used_vocabs` returned an incorrect value was fixed.
|
291
|
+
* Now `tomotopy.CTModel.prior_cov` returns a covariance matrix with shape `[k, k]`.
|
292
|
+
* Now `tomotopy.CTModel.get_correlations` with empty arguments returns a correlation matrix with shape `[k, k]`.
|
293
|
+
|
294
|
+
* 0.8.0 (2020-06-06)
|
295
|
+
* Since NumPy was introduced in tomotopy, many methods and properties of tomotopy return not just `list`, but `numpy.ndarray` now.
|
296
|
+
* Tomotopy has a new dependency `NumPy >= 1.10.0`.
|
297
|
+
* A wrong estimation of `tomotopy.HDPModel.infer` was fixed.
|
298
|
+
* A new method about converting HDPModel to LDAModel was added.
|
299
|
+
* New properties including `tomotopy.LDAModel.used_vocabs`, `tomotopy.LDAModel.used_vocab_freq` and `tomotopy.LDAModel.used_vocab_df` were added into topic models.
|
300
|
+
* A new g-DMR topic model(`tomotopy.GDMRModel`) was added.
|
301
|
+
* An error at initializing `tomotopy.label.FoRelevance` in macOS was fixed.
|
302
|
+
* An error that occurred when using `tomotopy.utils.Corpus` created without `raw` parameters was fixed.
|
303
|
+
|
304
|
+
* 0.7.1 (2020-05-08)
|
305
|
+
* `tomotopy.Document.path` was added for `tomotopy.HLDAModel`.
|
306
|
+
* A memory corruption bug in `tomotopy.label.PMIExtractor` was fixed.
|
307
|
+
* A compile error in gcc 7 was fixed.
|
308
|
+
|
309
|
+
* 0.7.0 (2020-04-18)
|
310
|
+
* `tomotopy.DTModel` was added into the package.
|
311
|
+
* A bug in `tomotopy.utils.Corpus.save` was fixed.
|
312
|
+
* A new method `tomotopy.Document.get_count_vector` was added into Document class.
|
313
|
+
* Now linux distributions use manylinux2010 and an additional optimization is applied.
|
314
|
+
|
315
|
+
* 0.6.2 (2020-03-28)
|
316
|
+
* A critical bug related to `save` and `load` was fixed. Version 0.6.0 and 0.6.1 have been removed from releases.
|
317
|
+
|
318
|
+
* 0.6.1 (2020-03-22) (removed)
|
319
|
+
* A bug related to module loading was fixed.
|
320
|
+
|
321
|
+
* 0.6.0 (2020-03-22) (removed)
|
322
|
+
* `tomotopy.utils.Corpus` class that manages multiple documents easily was added.
|
323
|
+
* `tomotopy.LDAModel.set_word_prior` method that controls word-topic priors of topic models was added.
|
324
|
+
* A new argument `min_df` that filters words based on document frequency was added into every topic model's __init__.
|
325
|
+
* `tomotopy.label`, the submodule about topic labeling was added. Currently, only `tomotopy.label.FoRelevance` is provided.
|
326
|
+
|
327
|
+
* 0.5.2 (2020-03-01)
|
328
|
+
* A segmentation fault problem was fixed in `tomotopy.LLDAModel.add_doc`.
|
329
|
+
* A bug was fixed that `infer` of `tomotopy.HDPModel` sometimes crashes the program.
|
330
|
+
* A crash issue was fixed of `tomotopy.LDAModel.infer` with ps=tomotopy.ParallelScheme.PARTITION, together=True.
|
331
|
+
|
332
|
+
* 0.5.1 (2020-01-11)
|
333
|
+
* A bug was fixed that `tomotopy.SLDAModel.make_doc` doesn't support missing values for `y`.
|
334
|
+
* Now `tomotopy.SLDAModel` fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in topic modeling, but excluded from regression of response variables.
|
335
|
+
|
336
|
+
* 0.5.0 (2019-12-30)
|
337
|
+
* Now `tomotopy.PAModel.infer` returns both topic distribution and sub-topic distribution.
|
338
|
+
* New methods get_sub_topics and get_sub_topic_dist were added into `tomotopy.Document`. (for PAModel)
|
339
|
+
* New parameter `parallel` was added for `tomotopy.LDAModel.train` and `tomotopy.LDAModel.infer` method. You can select parallelism algorithm by changing this parameter.
|
340
|
+
* `tomotopy.ParallelScheme.PARTITION`, a new algorithm, was added. It works efficiently when the number of workers is large, or when the number of topics or the size of the vocabulary is big.
|
341
|
+
* A bug where `rm_top` didn't work at `min_cf` < 2 was fixed.
|
342
|
+
|
343
|
+
* 0.4.2 (2019-11-30)
|
344
|
+
* Wrong topic assignments of `tomotopy.LLDAModel` and `tomotopy.PLDAModel` were fixed.
|
345
|
+
* Readable __repr__ of `tomotopy.Document` and `tomotopy.Dictionary` was implemented.
|
346
|
+
|
347
|
+
* 0.4.1 (2019-11-27)
|
348
|
+
* A bug at init function of `tomotopy.PLDAModel` was fixed.
|
349
|
+
|
350
|
+
* 0.4.0 (2019-11-18)
|
351
|
+
* New models including `tomotopy.PLDAModel` and `tomotopy.HLDAModel` were added into the package.
|
352
|
+
|
353
|
+
* 0.3.1 (2019-11-05)
|
354
|
+
* An issue where `get_topic_dist()` returns incorrect value when `min_cf` or `rm_top` is set was fixed.
|
355
|
+
* The return value of `get_topic_dist()` of `tomotopy.MGLDAModel` document was fixed to include local topics.
|
356
|
+
* The estimation speed with `tw=ONE` was improved.
|
357
|
+
|
358
|
+
* 0.3.0 (2019-10-06)
|
359
|
+
* A new model, `tomotopy.LLDAModel` was added into the package.
|
360
|
+
* A crashing issue of `HDPModel` was fixed.
|
361
|
+
* Since hyperparameter estimation for `HDPModel` was implemented, the result of `HDPModel` may differ from previous versions.
|
362
|
+
If you want to turn off hyperparameter estimation of HDPModel, set `optim_interval` to zero.
|
363
|
+
|
364
|
+
* 0.2.0 (2019-08-18)
|
365
|
+
* New models including `tomotopy.CTModel` and `tomotopy.SLDAModel` were added into the package.
|
366
|
+
* A new parameter option `rm_top` was added for all topic models.
|
367
|
+
* The problems in `save` and `load` method for `PAModel` and `HPAModel` were fixed.
|
368
|
+
* An occasional crash in loading `HDPModel` was fixed.
|
369
|
+
* The problem that `ll_per_word` was calculated incorrectly when `min_cf` > 0 was fixed.
|
370
|
+
|
371
|
+
* 0.1.6 (2019-08-09)
|
372
|
+
* Compiling errors at clang with macOS environment were fixed.
|
373
|
+
|
374
|
+
* 0.1.4 (2019-08-05)
|
375
|
+
* The issue when `add_doc` receives an empty list as input was fixed.
|
376
|
+
* The issue that `tomotopy.PAModel.get_topic_words` doesn't extract the word distribution of subtopic was fixed.
|
377
|
+
|
378
|
+
* 0.1.3 (2019-05-19)
|
379
|
+
* The parameter `min_cf` and its stopword-removing function were added for all topic models.
|
380
|
+
|
381
|
+
* 0.1.0 (2019-05-12)
|
382
|
+
* First version of **tomotopy**
|