tomoto 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/tomoto/ct.cpp +11 -11
- data/ext/tomoto/dmr.cpp +14 -13
- data/ext/tomoto/dt.cpp +14 -14
- data/ext/tomoto/ext.cpp +7 -7
- data/ext/tomoto/extconf.rb +1 -3
- data/ext/tomoto/gdmr.cpp +7 -7
- data/ext/tomoto/hdp.cpp +9 -9
- data/ext/tomoto/hlda.cpp +13 -13
- data/ext/tomoto/hpa.cpp +5 -5
- data/ext/tomoto/lda.cpp +42 -39
- data/ext/tomoto/llda.cpp +6 -6
- data/ext/tomoto/mglda.cpp +15 -15
- data/ext/tomoto/pa.cpp +6 -6
- data/ext/tomoto/plda.cpp +6 -6
- data/ext/tomoto/slda.cpp +8 -8
- data/ext/tomoto/utils.h +16 -70
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +57 -0
- data/vendor/tomotopy/README.rst +55 -0
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +4 -4
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +34 -14
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +4 -1
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +48 -21
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
- data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
- data/vendor/tomotopy/src/Utils/math.h +2 -2
- data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
- metadata +6 -6
    
        data/ext/tomoto/plda.cpp
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            #include <PLDA.h>
         | 
| 2 2 |  | 
| 3 | 
            -
            #include <rice/ | 
| 3 | 
            +
            #include <rice/rice.hpp>
         | 
| 4 4 |  | 
| 5 5 | 
             
            #include "utils.h"
         | 
| 6 6 |  | 
| 7 7 | 
             
            void init_plda(Rice::Module& m) {
         | 
| 8 8 | 
             
              Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
         | 
| 9 | 
            -
                . | 
| 9 | 
            +
                .define_singleton_function(
         | 
| 10 10 | 
             
                  "_new",
         | 
| 11 | 
            -
                   | 
| 11 | 
            +
                  [](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
         | 
| 12 12 | 
             
                    tomoto::PLDAArgs args;
         | 
| 13 13 | 
             
                    args.numLatentTopics = latent_topics;
         | 
| 14 14 | 
             
                    args.alpha = {alpha};
         | 
| @@ -17,17 +17,17 @@ void init_plda(Rice::Module& m) { | |
| 17 17 | 
             
                      args.seed = seed;
         | 
| 18 18 | 
             
                    }
         | 
| 19 19 | 
             
                    return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, args);
         | 
| 20 | 
            -
                  })
         | 
| 20 | 
            +
                  }, Rice::Return().takeOwnership())
         | 
| 21 21 | 
             
                .define_method(
         | 
| 22 22 | 
             
                  "_add_doc",
         | 
| 23 | 
            -
                   | 
| 23 | 
            +
                  [](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
         | 
| 24 24 | 
             
                    auto doc = buildDoc(words);
         | 
| 25 25 | 
             
                    doc.misc["labels"] = labels;
         | 
| 26 26 | 
             
                    return self.addDoc(doc);
         | 
| 27 27 | 
             
                  })
         | 
| 28 28 | 
             
                .define_method(
         | 
| 29 29 | 
             
                  "latent_topics",
         | 
| 30 | 
            -
                   | 
| 30 | 
            +
                  [](tomoto::IPLDAModel& self) {
         | 
| 31 31 | 
             
                    return self.getNumLatentTopics();
         | 
| 32 32 | 
             
                  });
         | 
| 33 33 | 
             
            }
         | 
    
        data/ext/tomoto/slda.cpp
    CHANGED
    
    | @@ -1,18 +1,18 @@ | |
| 1 1 | 
             
            #include <SLDA.h>
         | 
| 2 2 |  | 
| 3 | 
            -
            #include <rice/ | 
| 3 | 
            +
            #include <rice/rice.hpp>
         | 
| 4 4 |  | 
| 5 5 | 
             
            #include "utils.h"
         | 
| 6 6 |  | 
| 7 7 | 
             
            void init_slda(Rice::Module& m) {
         | 
| 8 8 | 
             
              Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
         | 
| 9 | 
            -
                . | 
| 9 | 
            +
                .define_singleton_function(
         | 
| 10 10 | 
             
                  "_new",
         | 
| 11 | 
            -
                   | 
| 11 | 
            +
                  [](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, size_t seed) {
         | 
| 12 12 | 
             
                    std::vector<tomoto::ISLDAModel::GLM> vars;
         | 
| 13 13 | 
             
                    vars.reserve(rb_vars.size());
         | 
| 14 14 | 
             
                    for (auto const& v : rb_vars) {
         | 
| 15 | 
            -
                      vars.push_back((tomoto::ISLDAModel::GLM)  | 
| 15 | 
            +
                      vars.push_back((tomoto::ISLDAModel::GLM) Rice::detail::From_Ruby<int>().convert(v.value()));
         | 
| 16 16 | 
             
                    }
         | 
| 17 17 | 
             
                    tomoto::SLDAArgs args;
         | 
| 18 18 | 
             
                    args.k = k;
         | 
| @@ -26,22 +26,22 @@ void init_slda(Rice::Module& m) { | |
| 26 26 | 
             
                      args.seed = seed;
         | 
| 27 27 | 
             
                    }
         | 
| 28 28 | 
             
                    return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, args);
         | 
| 29 | 
            -
                  })
         | 
| 29 | 
            +
                  }, Rice::Return().takeOwnership())
         | 
| 30 30 | 
             
                .define_method(
         | 
| 31 31 | 
             
                  "_add_doc",
         | 
| 32 | 
            -
                   | 
| 32 | 
            +
                  [](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
         | 
| 33 33 | 
             
                    auto doc = buildDoc(words);
         | 
| 34 34 | 
             
                    doc.misc["y"] = y;
         | 
| 35 35 | 
             
                    return self.addDoc(doc);
         | 
| 36 36 | 
             
                  })
         | 
| 37 37 | 
             
                .define_method(
         | 
| 38 38 | 
             
                  "f",
         | 
| 39 | 
            -
                   | 
| 39 | 
            +
                  [](tomoto::ISLDAModel& self) {
         | 
| 40 40 | 
             
                    return self.getF();
         | 
| 41 41 | 
             
                  })
         | 
| 42 42 | 
             
                .define_method(
         | 
| 43 43 | 
             
                  "_var_type",
         | 
| 44 | 
            -
                   | 
| 44 | 
            +
                  [](tomoto::ISLDAModel& self, size_t var_id) {
         | 
| 45 45 | 
             
                    if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
         | 
| 46 46 | 
             
                    return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
         | 
| 47 47 | 
             
                  });
         | 
    
        data/ext/tomoto/utils.h
    CHANGED
    
    | @@ -1,80 +1,26 @@ | |
| 1 1 | 
             
            #pragma once
         | 
| 2 2 |  | 
| 3 | 
            -
            #include <rice/ | 
| 3 | 
            +
            #include <rice/rice.hpp>
         | 
| 4 | 
            +
            #include <rice/stl.hpp>
         | 
| 4 5 |  | 
| 5 6 | 
             
            using Rice::Array;
         | 
| 6 7 | 
             
            using Rice::Object;
         | 
| 7 8 |  | 
| 8 | 
            -
             | 
| 9 | 
            -
            inline
         | 
| 10 | 
            -
            Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
         | 
| 9 | 
            +
            namespace Rice::detail
         | 
| 11 10 | 
             
            {
         | 
| 12 | 
            -
               | 
| 13 | 
            -
               | 
| 14 | 
            -
             | 
| 15 | 
            -
               | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
               | 
| 25 | 
            -
                res.push(v);
         | 
| 26 | 
            -
              }
         | 
| 27 | 
            -
              return res;
         | 
| 28 | 
            -
            }
         | 
| 29 | 
            -
             | 
| 30 | 
            -
            template<>
         | 
| 31 | 
            -
            inline
         | 
| 32 | 
            -
            Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
         | 
| 33 | 
            -
            {
         | 
| 34 | 
            -
              Array res;
         | 
| 35 | 
            -
              for (auto const& v : x) {
         | 
| 36 | 
            -
                res.push(v);
         | 
| 37 | 
            -
              }
         | 
| 38 | 
            -
              return res;
         | 
| 39 | 
            -
            }
         | 
| 40 | 
            -
             | 
| 41 | 
            -
            template<>
         | 
| 42 | 
            -
            inline
         | 
| 43 | 
            -
            std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
         | 
| 44 | 
            -
            {
         | 
| 45 | 
            -
              Array a = Array(x);
         | 
| 46 | 
            -
              std::vector<std::string> res;
         | 
| 47 | 
            -
              res.reserve(a.size());
         | 
| 48 | 
            -
              for (auto const& v : a) {
         | 
| 49 | 
            -
                res.push_back(from_ruby<std::string>(v));
         | 
| 50 | 
            -
              }
         | 
| 51 | 
            -
              return res;
         | 
| 52 | 
            -
            }
         | 
| 53 | 
            -
             | 
| 54 | 
            -
            template<>
         | 
| 55 | 
            -
            inline
         | 
| 56 | 
            -
            std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
         | 
| 57 | 
            -
            {
         | 
| 58 | 
            -
              Array a = Array(x);
         | 
| 59 | 
            -
              std::vector<tomoto::Float> res;
         | 
| 60 | 
            -
              res.reserve(a.size());
         | 
| 61 | 
            -
              for (auto const& v : a) {
         | 
| 62 | 
            -
                res.push_back(from_ruby<tomoto::Float>(v));
         | 
| 63 | 
            -
              }
         | 
| 64 | 
            -
              return res;
         | 
| 65 | 
            -
            }
         | 
| 66 | 
            -
             | 
| 67 | 
            -
            template<>
         | 
| 68 | 
            -
            inline
         | 
| 69 | 
            -
            std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
         | 
| 70 | 
            -
            {
         | 
| 71 | 
            -
              Array a = Array(x);
         | 
| 72 | 
            -
              std::vector<uint64_t> res;
         | 
| 73 | 
            -
              res.reserve(a.size());
         | 
| 74 | 
            -
              for (auto const& v : a) {
         | 
| 75 | 
            -
                res.push_back(from_ruby<uint64_t>(v));
         | 
| 76 | 
            -
              }
         | 
| 77 | 
            -
              return res;
         | 
| 11 | 
            +
              template<typename T>
         | 
| 12 | 
            +
              class To_Ruby<std::vector<T>>
         | 
| 13 | 
            +
              {
         | 
| 14 | 
            +
              public:
         | 
| 15 | 
            +
                VALUE convert(std::vector<T> const & x)
         | 
| 16 | 
            +
                {
         | 
| 17 | 
            +
                  auto a = rb_ary_new2(x.size());
         | 
| 18 | 
            +
                  for (const auto& v : x) {
         | 
| 19 | 
            +
                    detail::protect(rb_ary_push, a, To_Ruby<T>().convert(v));
         | 
| 20 | 
            +
                  }
         | 
| 21 | 
            +
                  return a;
         | 
| 22 | 
            +
                }
         | 
| 23 | 
            +
              };
         | 
| 78 24 | 
             
            }
         | 
| 79 25 |  | 
| 80 26 | 
             
            inline tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
         | 
    
        data/lib/tomoto/version.rb
    CHANGED
    
    
| @@ -198,6 +198,57 @@ add_doc은 `tomotopy.LDAModel.train`을 시작하기 전까지만 사용할 수 | |
| 198 198 | 
             
            infer 메소드는 `tomotopy.Document` 인스턴스 하나를 추론하거나 `tomotopy.Document` 인스턴스의 `list`를 추론하는데 사용할 수 있습니다. 
         | 
| 199 199 | 
             
            자세한 것은 `tomotopy.LDAModel.infer`을 참조하길 바랍니다.
         | 
| 200 200 |  | 
| 201 | 
            +
            Corpus와 transform
         | 
| 202 | 
            +
            --------------------
         | 
| 203 | 
            +
            `tomotopy`의 모든 토픽 모델들은 각자 별도의 내부적인 문헌 타입을 가지고 있습니다.
         | 
| 204 | 
            +
            그리고 이 문헌 타입들에 맞는 문헌들은 각 모델의 `add_doc` 메소드를 통해 생성될 수 있습니다.
         | 
| 205 | 
            +
            하지만 이 때문에 동일한 목록의 문헌들을 서로 다른 토픽 모델에 입력해야 하는 경우
         | 
| 206 | 
            +
            매 모델에 각 문헌을 추가할때마다 `add_doc`을 호출해야하기 때문에 비효율이 발생합니다.
         | 
| 207 | 
            +
            따라서 `tomotopy`에서는 여러 문헌을 묶어서 관리해주는 `tomotopy.utils.Corpus` 클래스를 제공합니다.
         | 
| 208 | 
            +
            토픽 모델 객체를 생성할때 `tomotopy.utils.Corpus`를 `__init__` 메소드의 `corpus` 인자로 넘겨줌으로써 
         | 
| 209 | 
            +
            어떤 모델에든 쉽게 문헌들을 삽입할 수 있게 해줍니다.
         | 
| 210 | 
            +
            `tomotopy.utils.Corpus`를 토픽 모델에 삽입하면 corpus 객체가 가지고 있는 문헌들 전부가 모델에 자동으로 삽입됩니다.
         | 
| 211 | 
            +
             | 
| 212 | 
            +
            그런데 일부 토픽 모델의 경우 문헌을 생성하기 위해 서로 다른 데이터를 요구합니다.
         | 
| 213 | 
            +
            예를 들어 `tomotopy.DMRModel`는 `metadata`라는 `str` 타입의 데이터를 요구하고, 
         | 
| 214 | 
            +
            `tomotopy.PLDAModel`는 `labels`라는 `List[str]` 타입의 데이터를 요구합니다. 
         | 
| 215 | 
            +
            그러나 `tomotopy.utils.Corpus`는 토픽 모델에 종속되지 않은 독립적인 문헌 데이터를 보관하기 때문에,
         | 
| 216 | 
            +
            corpus가 가지고 있는 문헌 데이터가 실제 토픽 모델이 요구하는 데이터와 일치하지 않을 가능성이 있습니다.
         | 
| 217 | 
            +
            이 경우 `transform`라는 인자를 통해 corpus 내의 데이터를 변형시켜 토픽 모델이 요구하는 실제 데이터와 일치시킬 수 있습니다.
         | 
| 218 | 
            +
            자세한 내용은 아래의 코드를 확인해주세요:
         | 
| 219 | 
            +
             | 
| 220 | 
            +
            ::
         | 
| 221 | 
            +
             | 
| 222 | 
            +
                from tomotopy import DMRModel
         | 
| 223 | 
            +
                from tomotopy.utils import Corpus
         | 
| 224 | 
            +
             | 
| 225 | 
            +
                corpus = Corpus()
         | 
| 226 | 
            +
                corpus.add_doc("a b c d e".split(), a_data=1)
         | 
| 227 | 
            +
                corpus.add_doc("e f g h i".split(), a_data=2)
         | 
| 228 | 
            +
                corpus.add_doc("i j k l m".split(), a_data=3)
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                model = DMRModel(k=10)
         | 
| 231 | 
            +
                model.add_corpus(corpus) 
         | 
| 232 | 
            +
                # `corpus`에 있던 `a_data`는 사라지고
         | 
| 233 | 
            +
                # `DMRModel`이 요구하는 `metadata`에는 기본값인 빈 문자열이 채워집니다.
         | 
| 234 | 
            +
             | 
| 235 | 
            +
                assert model.docs[0].metadata == ''
         | 
| 236 | 
            +
                assert model.docs[1].metadata == ''
         | 
| 237 | 
            +
                assert model.docs[2].metadata == ''
         | 
| 238 | 
            +
             | 
| 239 | 
            +
                def transform_a_data_to_metadata(misc: dict):
         | 
| 240 | 
            +
                    return {'metadata': str(misc['a_data'])}
         | 
| 241 | 
            +
                # 이 함수는 `a_data`를 `metadata`로 변환합니다.
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                model = DMRModel(k=10)
         | 
| 244 | 
            +
                model.add_corpus(corpus, transform=transform_a_data_to_metadata)
         | 
| 245 | 
            +
                # 이제 `model`에는 기본값이 아닌 `metadata`가 입력됩니다. 이들은 `transform`에 의해 `a_data`로부터 생성됩니다.
         | 
| 246 | 
            +
             | 
| 247 | 
            +
                assert model.docs[0].metadata == '1'
         | 
| 248 | 
            +
                assert model.docs[1].metadata == '2'
         | 
| 249 | 
            +
                assert model.docs[2].metadata == '3'
         | 
| 250 | 
            +
             | 
| 251 | 
            +
             | 
| 201 252 | 
             
            병렬 샘플링 알고리즘
         | 
| 202 253 | 
             
            ----------------------------
         | 
| 203 254 | 
             
            `tomotopy`는 0.5.0버전부터 병렬 알고리즘을 고를 수 있는 선택지를 제공합니다.
         | 
| @@ -254,6 +305,12 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma | |
| 254 305 |  | 
| 255 306 | 
             
            역사
         | 
| 256 307 | 
             
            -------
         | 
| 308 | 
            +
            * 0.12.1 (2021-06-20)
         | 
| 309 | 
            +
                * `tomotopy.LDAModel.set_word_prior()`가 크래시를 발생시키던 문제를 해결했습니다.
         | 
| 310 | 
            +
                * 이제 `tomotopy.LDAModel.perplexity`와 `tomotopy.LDAModel.ll_per_word`가 TermWeight가 ONE이 아닌 경우에도 정확한 값을 반환합니다.
         | 
| 311 | 
            +
                * 용어가중치가 적용된 빈도수를 반환하는 `tomotopy.LDAModel.used_vocab_weighted_freq`가 추가되었습니다.
         | 
| 312 | 
            +
                * 이제 `tomotopy.LDAModel.summary()`가 단어의 엔트로피뿐만 아니라, 용어 가중치가 적용된 단어의 엔트로피도 함께 보여줍니다.
         | 
| 313 | 
            +
             | 
| 257 314 | 
             
            * 0.12.0 (2021-04-26)
         | 
| 258 315 | 
             
                * 이제 `tomotopy.DMRModel`와 `tomotopy.GDMRModel`가 다중 메타데이터를 지원합니다. (https://github.com/bab2min/tomotopy/blob/main/examples/dmr_multi_label.py 참조)
         | 
| 259 316 | 
             
                * `tomotopy.GDMRModel`의 성능이 개선되었습니다.
         | 
    
        data/vendor/tomotopy/README.rst
    CHANGED
    
    | @@ -202,6 +202,55 @@ Inference for unseen document should be performed using `tomotopy.LDAModel.infer | |
| 202 202 | 
             
            The `infer` method can infer only one instance of `tomotopy.Document` or a `list` of instances of `tomotopy.Document`. 
         | 
| 203 203 | 
             
            See more at `tomotopy.LDAModel.infer`.
         | 
| 204 204 |  | 
| 205 | 
            +
            Corpus and transform
         | 
| 206 | 
            +
            --------------------
         | 
| 207 | 
            +
            Every topic model in `tomotopy` has its own internal document type.
         | 
| 208 | 
            +
            A document of the type suitable for a model can be created and added through that model's `add_doc` method.
         | 
| 209 | 
            +
            However, trying to add the same list of documents to different models becomes quite inconvenient, 
         | 
| 210 | 
            +
            because `add_doc` has to be called for every document in the list, once for each model.
         | 
| 211 | 
            +
            Thus, `tomotopy` provides `tomotopy.utils.Corpus` class that holds a list of documents. 
         | 
| 212 | 
            +
            A `tomotopy.utils.Corpus` can be inserted into any model by passing it as the `corpus` argument to the `__init__` or `add_corpus` method of that model.
         | 
| 213 | 
            +
            So, inserting a `tomotopy.utils.Corpus` has the same effect as inserting the documents the corpus holds.
         | 
| 214 | 
            +
             | 
| 215 | 
            +
            Some topic models require different data for their documents.
         | 
| 216 | 
            +
            For example, `tomotopy.DMRModel` requires argument `metadata` in `str` type, 
         | 
| 217 | 
            +
            but `tomotopy.PLDAModel` requires argument `labels` in `List[str]` type. 
         | 
| 218 | 
            +
            Since `tomotopy.utils.Corpus` holds an independent set of documents rather than being tied to a specific topic model, 
         | 
| 219 | 
            +
            the data held by a corpus may not match the data types a topic model requires when the corpus is added into that topic model.
         | 
| 220 | 
            +
            In this case, the miscellaneous data can be transformed to fit the target topic model using the `transform` argument.
         | 
| 221 | 
            +
            See more details in the following code:
         | 
| 222 | 
            +
             | 
| 223 | 
            +
            ::
         | 
| 224 | 
            +
             | 
| 225 | 
            +
                from tomotopy import DMRModel
         | 
| 226 | 
            +
                from tomotopy.utils import Corpus
         | 
| 227 | 
            +
             | 
| 228 | 
            +
                corpus = Corpus()
         | 
| 229 | 
            +
                corpus.add_doc("a b c d e".split(), a_data=1)
         | 
| 230 | 
            +
                corpus.add_doc("e f g h i".split(), a_data=2)
         | 
| 231 | 
            +
                corpus.add_doc("i j k l m".split(), a_data=3)
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                model = DMRModel(k=10)
         | 
| 234 | 
            +
                model.add_corpus(corpus) 
         | 
| 235 | 
            +
                # You lose `a_data` field in `corpus`, 
         | 
| 236 | 
            +
                # and `metadata` that `DMRModel` requires is filled with the default value, empty str.
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                assert model.docs[0].metadata == ''
         | 
| 239 | 
            +
                assert model.docs[1].metadata == ''
         | 
| 240 | 
            +
                assert model.docs[2].metadata == ''
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                def transform_a_data_to_metadata(misc: dict):
         | 
| 243 | 
            +
                    return {'metadata': str(misc['a_data'])}
         | 
| 244 | 
            +
                # this function transforms `a_data` to `metadata`
         | 
| 245 | 
            +
             | 
| 246 | 
            +
                model = DMRModel(k=10)
         | 
| 247 | 
            +
                model.add_corpus(corpus, transform=transform_a_data_to_metadata)
         | 
| 248 | 
            +
                # Now the docs in `model` have non-default `metadata`, generated from the `a_data` field.
         | 
| 249 | 
            +
             | 
| 250 | 
            +
                assert model.docs[0].metadata == '1'
         | 
| 251 | 
            +
                assert model.docs[1].metadata == '2'
         | 
| 252 | 
            +
                assert model.docs[2].metadata == '3'
         | 
| 253 | 
            +
             | 
| 205 254 | 
             
            Parallel Sampling Algorithms
         | 
| 206 255 | 
             
            ----------------------------
         | 
| 207 256 | 
             
            Since version 0.5.0, `tomotopy` allows you to choose a parallelism algorithm. 
         | 
| @@ -260,6 +309,12 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh | |
| 260 309 |  | 
| 261 310 | 
             
            History
         | 
| 262 311 | 
             
            -------
         | 
| 312 | 
            +
            * 0.12.1 (2021-06-20)
         | 
| 313 | 
            +
                * An issue where `tomotopy.LDAModel.set_word_prior()` causes a crash has been fixed.
         | 
| 314 | 
            +
                * Now `tomotopy.LDAModel.perplexity` and `tomotopy.LDAModel.ll_per_word` return the accurate value when `TermWeight` is not `ONE`.
         | 
| 315 | 
            +
                * `tomotopy.LDAModel.used_vocab_weighted_freq` was added, which returns term-weighted frequencies of words.
         | 
| 316 | 
            +
                * Now `tomotopy.LDAModel.summary()` shows not only the entropy of words, but also the entropy of term-weighted words.
         | 
| 317 | 
            +
             | 
| 263 318 | 
             
            * 0.12.0 (2021-04-26)
         | 
| 264 319 | 
             
                * Now `tomotopy.DMRModel` and `tomotopy.GDMRModel` support multiple values of metadata (see https://github.com/bab2min/tomotopy/blob/main/examples/dmr_multi_label.py )
         | 
| 265 320 | 
             
                * The performance of `tomotopy.GDMRModel` was improved.
         | 
| @@ -316,7 +316,7 @@ namespace tomoto | |
| 316 316 | 
             
            				}
         | 
| 317 317 | 
             
            			}
         | 
| 318 318 |  | 
| 319 | 
            -
            			float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
         | 
| 319 | 
            +
            			float totN = (float)std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
         | 
| 320 320 | 
             
            			const float logTotN = std::log(totN);
         | 
| 321 321 |  | 
| 322 322 | 
             
            			// calculating PMIs
         | 
| @@ -489,7 +489,7 @@ namespace tomoto | |
| 489 489 |  | 
| 490 490 | 
             
            				float rbe = branchingEntropy(trieNodes[0].getNext(bigram.first)->getNext(bigram.second), candMinCnt);
         | 
| 491 491 | 
             
            				float lbe = branchingEntropy(trieNodesBw[0].getNext(bigram.second)->getNext(bigram.first), candMinCnt);
         | 
| 492 | 
            -
            				float nbe = std::sqrt(rbe * lbe) / std::log(p.second);
         | 
| 492 | 
            +
            				float nbe = std::sqrt(rbe * lbe) / (float)std::log(p.second);
         | 
| 493 493 | 
             
            				if (nbe < minNBE) continue;
         | 
| 494 494 | 
             
            				candidates.emplace_back(npmi * nbe, bigram.first, bigram.second);
         | 
| 495 495 | 
             
            				candidates.back().cf = p.second;
         | 
| @@ -512,7 +512,7 @@ namespace tomoto | |
| 512 512 |  | 
| 513 513 | 
             
            					float rbe = branchingEntropy(node, candMinCnt);
         | 
| 514 514 | 
             
            					float lbe = branchingEntropy(trieNodesBw[0].findNode(rkeys.rbegin(), rkeys.rend()), candMinCnt);
         | 
| 515 | 
            -
            					float nbe = std::sqrt(rbe * lbe) / std::log(node->val);
         | 
| 515 | 
            +
            					float nbe = std::sqrt(rbe * lbe) / (float)std::log(node->val);
         | 
| 516 516 | 
             
            					if (nbe < minNBE) return;
         | 
| 517 517 | 
             
            					candidates.emplace_back(npmi * nbe, rkeys);
         | 
| 518 518 | 
             
            					candidates.back().cf = node->val;
         | 
| @@ -33,7 +33,10 @@ namespace tomoto | |
| 33 33 | 
             
            		friend typename BaseClass::BaseClass;
         | 
| 34 34 | 
             
            		using WeightType = typename BaseClass::WeightType;
         | 
| 35 35 |  | 
| 36 | 
            -
            		static constexpr  | 
| 36 | 
            +
            		static constexpr auto tmid()
         | 
| 37 | 
            +
            		{
         | 
| 38 | 
            +
            			return serializer::to_key("CTM\0");
         | 
| 39 | 
            +
            		}
         | 
| 37 40 |  | 
| 38 41 | 
             
            		uint64_t numBetaSample = 10;
         | 
| 39 42 | 
             
            		uint64_t numTMNSample = 5;
         | 
| @@ -247,7 +250,7 @@ namespace tomoto | |
| 247 250 | 
             
            			this->optimInterval = 2;
         | 
| 248 251 | 
             
            		}
         | 
| 249 252 |  | 
| 250 | 
            -
            		std::vector<Float>  | 
| 253 | 
            +
            		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
         | 
| 251 254 | 
             
            		{
         | 
| 252 255 | 
             
            			std::vector<Float> ret(this->K);
         | 
| 253 256 | 
             
            			Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
         | 
| @@ -47,7 +47,10 @@ namespace tomoto | |
| 47 47 | 
             
            		friend typename BaseClass::BaseClass;
         | 
| 48 48 | 
             
            		using WeightType = typename BaseClass::WeightType;
         | 
| 49 49 |  | 
| 50 | 
            -
            		static constexpr  | 
| 50 | 
            +
            		static constexpr auto tmid()
         | 
| 51 | 
            +
            		{
         | 
| 52 | 
            +
            			return serializer::to_key("DMR\0");
         | 
| 53 | 
            +
            		}
         | 
| 51 54 |  | 
| 52 55 | 
             
            		Matrix lambda;
         | 
| 53 56 | 
             
            		mutable std::unordered_map<std::pair<uint64_t, Vector>, size_t, MdHash> mdHashMap;
         | 
| @@ -449,7 +452,7 @@ namespace tomoto | |
| 449 452 | 
             
            			optimRepeat = _optimRepeat;
         | 
| 450 453 | 
             
            		}
         | 
| 451 454 |  | 
| 452 | 
            -
            		std::vector<Float>  | 
| 455 | 
            +
            		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
         | 
| 453 456 | 
             
            		{
         | 
| 454 457 | 
             
            			std::vector<Float> ret(this->K);
         | 
| 455 458 | 
             
            			auto alphaDoc = getCachedAlpha(doc);
         | 
| @@ -41,7 +41,10 @@ namespace tomoto | |
| 41 41 | 
             
            		friend typename BaseClass::BaseClass;
         | 
| 42 42 | 
             
            		using WeightType = typename BaseClass::WeightType;
         | 
| 43 43 |  | 
| 44 | 
            -
            		static constexpr  | 
| 44 | 
            +
            		static constexpr auto tmid()
         | 
| 45 | 
            +
            		{
         | 
| 46 | 
            +
            			return serializer::to_key("DTM\0");
         | 
| 47 | 
            +
            		}
         | 
| 45 48 |  | 
| 46 49 | 
             
            		uint64_t T;
         | 
| 47 50 | 
             
            		Float shapeA = 0.03f, shapeB = 0.1f, shapeC = 0.55f;
         | 
| @@ -54,7 +57,7 @@ namespace tomoto | |
| 54 57 | 
             
            		std::vector<sample::AliasMethod<>> wordAliasTables; // Dim: (Word * Time)
         | 
| 55 58 |  | 
| 56 59 | 
             
            		template<int _inc>
         | 
| 57 | 
            -
            		inline void addWordTo(_ModelState& ld, _DocType& doc,  | 
| 60 | 
            +
            		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, Tid tid) const
         | 
| 58 61 | 
             
            		{
         | 
| 59 62 | 
             
            			assert(tid < this->K);
         | 
| 60 63 | 
             
            			assert(vid < this->realV);
         | 
| @@ -168,7 +168,7 @@ namespace tomoto | |
| 168 168 | 
             
            		}
         | 
| 169 169 |  | 
| 170 170 | 
             
            		template<int _inc> 
         | 
| 171 | 
            -
            		inline void addWordTo(_ModelState& ld, _DocType& doc,  | 
| 171 | 
            +
            		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, size_t tableId, Tid tid) const
         | 
| 172 172 | 
             
            		{
         | 
| 173 173 | 
             
            			addOnlyWordTo<_inc>(ld, doc, pid, vid, tid);
         | 
| 174 174 | 
             
            			constexpr bool _dec = _inc < 0 && _tw != TermWeight::one;
         | 
| @@ -490,7 +490,7 @@ namespace tomoto | |
| 490 490 | 
             
            			THROW_ERROR_WITH_INFO(exc::Unimplemented, "HDPModel doesn't provide setWordPrior function.");
         | 
| 491 491 | 
             
            		}
         | 
| 492 492 |  | 
| 493 | 
            -
            		std::vector<Float>  | 
| 493 | 
            +
            		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
         | 
| 494 494 | 
             
            		{
         | 
| 495 495 | 
             
            			std::vector<Float> ret(this->K);
         | 
| 496 496 | 
             
            			Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
         | 
| @@ -522,7 +522,7 @@ namespace tomoto | |
| 522 522 | 
             
            			for (size_t i = 0; i < cntIdx.size(); ++i)
         | 
| 523 523 | 
             
            			{
         | 
| 524 524 | 
             
            				if (i && cntIdx[i].first / sum <= topicThreshold) break;
         | 
| 525 | 
            -
            				newK[cntIdx[i].second] = i;
         | 
| 525 | 
            +
            				newK[cntIdx[i].second] = (Tid)i;
         | 
| 526 526 | 
             
            				liveK++;
         | 
| 527 527 | 
             
            			}
         | 
| 528 528 |  | 
| @@ -558,7 +558,7 @@ namespace tomoto | |
| 558 558 | 
             
            						lda->docs[i].Zs[j] = non_topic_id;
         | 
| 559 559 | 
             
            						continue;
         | 
| 560 560 | 
             
            					}
         | 
| 561 | 
            -
            					 | 
| 561 | 
            +
            					Tid newTopic = newK[this->docs[i].numTopicByTable[this->docs[i].Zs[j]].topic];
         | 
| 562 562 | 
             
            					while (newTopic == (Tid)-1) newTopic = newK[randomTopic(rng)];
         | 
| 563 563 | 
             
            					lda->docs[i].Zs[j] = newTopic;
         | 
| 564 564 | 
             
            				}
         |