RubyGems - tomoto - Versions diffs - 0.1.3 → 0.1.4 - Mend

tomoto 0.1.3 → 0.1.4

Files changed (50) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/LICENSE.txt +1 -1
data/README.md +7 -0
data/ext/tomoto/ct.cpp +54 -0
data/ext/tomoto/dmr.cpp +62 -0
data/ext/tomoto/dt.cpp +82 -0
data/ext/tomoto/ext.cpp +27 -773
data/ext/tomoto/gdmr.cpp +34 -0
data/ext/tomoto/hdp.cpp +42 -0
data/ext/tomoto/hlda.cpp +66 -0
data/ext/tomoto/hpa.cpp +27 -0
data/ext/tomoto/lda.cpp +250 -0
data/ext/tomoto/llda.cpp +29 -0
data/ext/tomoto/mglda.cpp +71 -0
data/ext/tomoto/pa.cpp +27 -0
data/ext/tomoto/plda.cpp +29 -0
data/ext/tomoto/slda.cpp +40 -0
data/ext/tomoto/utils.h +84 -0
data/lib/tomoto/tomoto.bundle +0 -0
data/lib/tomoto/tomoto.so +0 -0
data/lib/tomoto/version.rb +1 -1
data/vendor/tomotopy/README.kr.rst +12 -3
data/vendor/tomotopy/README.rst +12 -3
data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
data/vendor/tomotopy/src/Utils/math.h +8 -4
data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
metadata +24 -60

data/ext/tomoto/gdmr.cpp ADDED Viewed

@@ -0,0 +1,34 @@
+#include <GDMR.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+// Registers the Ruby class "GDMR" under module `m`, backed by
+// tomoto::IGDMRModel (tomoto::IDMRModel is given as the second template
+// argument of define_class_under).
+void init_gdmr(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IGDMRModel, tomoto::IDMRModel>(m, "GDMR")
+    // Factory. A negative seed is replaced by a value drawn from
+    // std::random_device before the model is created.
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k, std::vector<uint64_t> degrees, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float sigma0, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        const auto term_weight = static_cast<tomoto::TermWeight>(tw);
+        return tomoto::IGDMRModel::create(term_weight, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
+      })
+    // Builds a document from `words`, attaches the numeric metadata under the
+    // "metadata" key and hands it to the model.
+    .define_method(
+      "_add_doc",
+      *[](tomoto::IGDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
+        auto raw_doc = buildDoc(words);
+        raw_doc.misc["metadata"] = metadata;
+        return self.addDoc(raw_doc);
+      })
+    // Read-only accessors that forward straight to the model.
+    .define_method("degrees", *[](tomoto::IGDMRModel& self) { return self.getFs(); })
+    .define_method("sigma0", *[](tomoto::IGDMRModel& self) { return self.getSigma0(); });
+}

data/ext/tomoto/hdp.cpp ADDED Viewed

@@ -0,0 +1,42 @@
+#include <HDP.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+// Registers the Ruby class "HDP" under module `m`, backed by
+// tomoto::IHDPModel (tomoto::ILDAModel is given as the second template
+// argument of define_class_under).
+void init_hdp(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IHDPModel, tomoto::ILDAModel>(m, "HDP")
+    // Factory. A negative seed is replaced by a value drawn from
+    // std::random_device before the model is created.
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        const auto term_weight = static_cast<tomoto::TermWeight>(tw);
+        return tomoto::IHDPModel::create(term_weight, k, alpha, eta, gamma, seed);
+      })
+    // Read-only accessors that forward straight to the model.
+    .define_method("alpha", *[](tomoto::IHDPModel& self) { return self.getAlpha(); })
+    .define_method("gamma", *[](tomoto::IHDPModel& self) { return self.getGamma(); })
+    .define_method("live_k", *[](tomoto::IHDPModel& self) { return self.getLiveK(); })
+    .define_method("live_topic?", *[](tomoto::IHDPModel& self, size_t tid) { return self.isLiveTopic(tid); })
+    .define_method("num_tables", *[](tomoto::IHDPModel& self) { return self.getTotalTables(); });
+}

data/ext/tomoto/hlda.cpp ADDED Viewed

@@ -0,0 +1,66 @@
+#include <HLDA.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+// Registers the Ruby class "HLDA" under module `m`, backed by
+// tomoto::IHLDAModel (tomoto::ILDAModel is given as the second template
+// argument of define_class_under).
+void init_hlda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IHLDAModel, tomoto::ILDAModel>(m, "HLDA")
+    // Factory. A negative seed is replaced by a value drawn from
+    // std::random_device before the model is created.
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t levelDepth, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        const auto term_weight = static_cast<tomoto::TermWeight>(tw);
+        return tomoto::IHLDAModel::create(term_weight, levelDepth, alpha, eta, gamma, seed);
+      })
+    // Collects getAlpha(level) for every level below getLevelDepth().
+    .define_method(
+      "alpha",
+      *[](tomoto::IHLDAModel& self) {
+        const auto depth = self.getLevelDepth();
+        Array alphas;
+        for (size_t level = 0; level < depth; ++level) {
+          alphas.push(self.getAlpha(level));
+        }
+        return alphas;
+      })
+    // The remaining methods forward a single call to the model.
+    .define_method("_children_topics", *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) { return self.getChildTopicId(topic_id); })
+    .define_method("depth", *[](tomoto::IHLDAModel& self) { return self.getLevelDepth(); })
+    .define_method("gamma", *[](tomoto::IHLDAModel& self) { return self.getGamma(); })
+    .define_method("_level", *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) { return self.getLevelOfTopic(topic_id); })
+    .define_method("live_k", *[](tomoto::IHLDAModel& self) { return self.getLiveK(); })
+    .define_method("_live_topic?", *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) { return self.isLiveTopic(topic_id); })
+    .define_method("_num_docs_of_topic", *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) { return self.getNumDocsOfTopic(topic_id); })
+    .define_method("_parent_topic", *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) { return self.getParentTopicId(topic_id); });
+}

data/ext/tomoto/hpa.cpp ADDED Viewed

@@ -0,0 +1,27 @@
+#include <HPA.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+// Registers the Ruby class "HPA" under module `m`, backed by
+// tomoto::IHPAModel (tomoto::IPAModel is given as the second template
+// argument of define_class_under).
+void init_hpa(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IHPAModel, tomoto::IPAModel>(m, "HPA")
+    // Factory. A negative seed is replaced by a value drawn from
+    // std::random_device. The literal `false` is forwarded as the second
+    // argument of create() -- presumably its "exclusive" flag; confirm
+    // against HPA.h.
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        const auto term_weight = static_cast<tomoto::TermWeight>(tw);
+        return tomoto::IHPAModel::create(term_weight, false, k1, k2, alpha, eta, seed);
+      })
+    // Collects getAlpha(idx) for idx in [0, getK()].
+    .define_method(
+      "alpha",
+      *[](tomoto::IHPAModel& self) {
+        // The upper bound is inclusive on purpose: K + 1 values are returned.
+        const auto last = self.getK();
+        Array alphas;
+        for (size_t idx = 0; idx <= last; ++idx) {
+          alphas.push(self.getAlpha(idx));
+        }
+        return alphas;
+      });
+}

data/ext/tomoto/lda.cpp ADDED Viewed

@@ -0,0 +1,250 @@
+#include <fstream>
+#include <iostream>
+#include <LDA.h>
+#include <rice/Class.hpp>
+#include <rice/Hash.hpp>
+#include <rice/Module.hpp>
+#include "utils.h"
+// Pairs a document pointer with the topic model pointer it was obtained from,
+// so a bound method can call back into the model for that document.
+// Both pointers are stored as given; this class never frees them.
+// NOTE(review): nothing here keeps the model alive -- presumably the caller
+// must ensure the model outlives every DocumentObject; confirm.
+class DocumentObject
+{
+public:
+  const tomoto::DocumentBase* doc;  // document passed to the constructor
+  const tomoto::ITopicModel* tm;    // model passed to the constructor
+  DocumentObject(const tomoto::DocumentBase* document, const tomoto::ITopicModel* model)
+    : doc(document), tm(model) {}
+};
+// Registers the Ruby classes "Document" and "LDA" under module `m`.
+//
+// "Document" wraps DocumentObject and exposes `topics`. "LDA" wraps
+// tomoto::ILDAModel. Methods whose Ruby name starts with `_` are presumably
+// called from Ruby-side wrappers that are not visible in this file.
+//
+// `_load` and `_save` throw std::runtime_error when the file cannot be opened.
+void init_lda(Rice::Module& m) {
+  Rice::define_class_under<DocumentObject>(m, "Document")
+    // Returns a Hash mapping each index of getTopicsByDoc()'s result to its value.
+    .define_method(
+      "topics",
+      *[](DocumentObject& self) {
+        Rice::Hash res;
+        auto topics = self.tm->getTopicsByDoc(self.doc);
+        for (size_t i = 0; i < topics.size(); i++) {
+          res[i] = topics[i];
+        }
+        return res;
+      });
+  Rice::define_class_under<tomoto::ILDAModel>(m, "LDA")
+    // Factory. A negative seed is replaced by a value drawn from
+    // std::random_device before the model is created.
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        return tomoto::ILDAModel::create(static_cast<tomoto::TermWeight>(tw), k, alpha, eta, seed);
+      })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
+        return self.addDoc(buildDoc(words));
+      })
+    // Collects getAlpha(i) for every i below getK().
+    .define_method(
+      "alpha",
+      *[](tomoto::ILDAModel& self) {
+        Array res;
+        for (size_t i = 0; i < self.getK(); i++) {
+          res.push(self.getAlpha(i));
+        }
+        return res;
+      })
+    .define_method(
+      "burn_in",
+      *[](tomoto::ILDAModel& self) {
+        return self.getBurnInIteration();
+      })
+    // Setter; returns the assigned value.
+    .define_method(
+      "burn_in=",
+      *[](tomoto::ILDAModel& self, size_t iteration) {
+        self.setBurnInIteration(iteration);
+        return iteration;
+      })
+    .define_method(
+      "_count_by_topics",
+      *[](tomoto::ILDAModel& self) {
+        Array res;
+        for (auto const& v : self.getCountByTopic()) {
+          res.push(v);
+        }
+        return res;
+      })
+    // Wraps every document of the model in a DocumentObject that also carries
+    // a pointer back to this model.
+    .define_method(
+      "docs",
+      *[](tomoto::ILDAModel& self) {
+        Array res;
+        auto n = self.getNumDocs();
+        for (size_t i = 0; i < n; i++) {
+          res.push(DocumentObject(self.getDoc(i), &self));
+        }
+        return res;
+      })
+    .define_method(
+      "eta",
+      *[](tomoto::ILDAModel& self) {
+        return self.getEta();
+      })
+    .define_method(
+      "global_step",
+      *[](tomoto::ILDAModel& self) {
+        return self.getGlobalStep();
+      })
+    .define_method(
+      "k",
+      *[](tomoto::ILDAModel& self) {
+        return self.getK();
+      })
+    // Loads a model from `filename` (opened in binary mode).
+    // Throws std::runtime_error when the file cannot be opened.
+    .define_method(
+      "_load",
+      *[](tomoto::ILDAModel& self, const char* filename) {
+        std::ifstream str{ filename, std::ios_base::binary };
+        if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
+        // loadModel takes a buffer for extra data; it is discarded here.
+        std::vector<uint8_t> extra_data;
+        self.loadModel(str, &extra_data);
+      })
+    .define_method(
+      "ll_per_word",
+      *[](tomoto::ILDAModel& self) {
+        return self.getLLPerWord();
+      })
+    .define_method(
+      "num_docs",
+      *[](tomoto::ILDAModel& self) {
+        return self.getNumDocs();
+      })
+    .define_method(
+      "num_vocabs",
+      *[](tomoto::ILDAModel& self) {
+        return self.getV();
+      })
+    .define_method(
+      "num_words",
+      *[](tomoto::ILDAModel& self) {
+        return self.getN();
+      })
+    .define_method(
+      "optim_interval",
+      *[](tomoto::ILDAModel& self) {
+        return self.getOptimInterval();
+      })
+    // Setter; returns the assigned value.
+    .define_method(
+      "optim_interval=",
+      *[](tomoto::ILDAModel& self, size_t value) {
+        self.setOptimInterval(value);
+        return value;
+      })
+    .define_method(
+      "perplexity",
+      *[](tomoto::ILDAModel& self) {
+        return self.getPerplexity();
+      })
+    // Forwards to prepare() with `true` as its first argument.
+    .define_method(
+      "_prepare",
+      *[](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
+        self.prepare(true, minCnt, minDf, rmTop);
+      })
+    // Returns the last `rmTop` words of the vocabulary dictionary, in
+    // dictionary order.
+    .define_method(
+      "_removed_top_words",
+      *[](tomoto::ILDAModel& self, size_t rmTop) {
+        Array res;
+        // Bind by reference so the dictionary is not copied on every call.
+        const auto& dict = self.getVocabDict();
+        const size_t size = dict.size();
+        // Clamp so that `size - i` cannot wrap around when rmTop > size.
+        const size_t count = rmTop < size ? rmTop : size;
+        for (size_t i = count; i > 0; i--) {
+          res.push(dict.toWord(size - i));
+        }
+        return res;
+      })
+    // Saves the model to `filename` (opened in binary mode).
+    // Throws std::runtime_error when the file cannot be opened, mirroring
+    // `_load`, instead of silently writing nothing.
+    .define_method(
+      "_save",
+      *[](tomoto::ILDAModel& self, const char* filename, bool full) {
+        std::ofstream str{ filename, std::ios_base::binary };
+        if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
+        // No extra data is stored; an empty buffer is passed.
+        std::vector<uint8_t> extra_data;
+        self.saveModel(str, full, &extra_data);
+      })
+    // Returns a Hash built from the (first, second) pairs of
+    // getWordsByTopicSorted(topicId, topN).
+    .define_method(
+      "_topic_words",
+      *[](tomoto::ILDAModel& self, size_t topicId, size_t topN) {
+        Rice::Hash res;
+        for (auto const& v : self.getWordsByTopicSorted(topicId, topN)) {
+          res[v.first] = v.second;
+        }
+        return res;
+      })
+    .define_method(
+      "_train",
+      *[](tomoto::ILDAModel& self, size_t iteration, size_t workers, size_t ps) {
+        self.train(iteration, workers, static_cast<tomoto::ParallelScheme>(ps));
+      })
+    .define_method(
+      "_tw",
+      *[](tomoto::ILDAModel& self) {
+        return static_cast<int>(self.getTermWeight());
+      })
+    // The `used_*` methods cover only the first getV() vocabulary entries;
+    // the un-prefixed variants further down cover the whole container.
+    .define_method(
+      "used_vocab_df",
+      *[](tomoto::ILDAModel& self) {
+        const auto& vocab = self.getVocabDf();
+        const size_t used = self.getV();
+        Array res;
+        for (size_t i = 0; i < used; i++) {
+          res.push(vocab[i]);
+        }
+        return res;
+      })
+    .define_method(
+      "used_vocab_freq",
+      *[](tomoto::ILDAModel& self) {
+        const auto& vocab = self.getVocabCf();
+        const size_t used = self.getV();
+        Array res;
+        for (size_t i = 0; i < used; i++) {
+          res.push(vocab[i]);
+        }
+        return res;
+      })
+    .define_method(
+      "used_vocabs",
+      *[](tomoto::ILDAModel& self) {
+        const auto& dict = self.getVocabDict();
+        const size_t used = self.getV();
+        Array res;
+        // Each word is converted to a Ruby string and tagged as UTF-8.
+        auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
+        for (size_t i = 0; i < used; i++) {
+          res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
+        }
+        return res;
+      })
+    .define_method(
+      "vocab_df",
+      *[](tomoto::ILDAModel& self) {
+        const auto& vocab = self.getVocabDf();
+        Array res;
+        for (size_t i = 0; i < vocab.size(); i++) {
+          res.push(vocab[i]);
+        }
+        return res;
+      })
+    .define_method(
+      "vocab_freq",
+      *[](tomoto::ILDAModel& self) {
+        const auto& vocab = self.getVocabCf();
+        Array res;
+        for (size_t i = 0; i < vocab.size(); i++) {
+          res.push(vocab[i]);
+        }
+        return res;
+      })
+    .define_method(
+      "vocabs",
+      *[](tomoto::ILDAModel& self) {
+        const auto& dict = self.getVocabDict();
+        Array res;
+        // Each word is converted to a Ruby string and tagged as UTF-8.
+        auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
+        for (size_t i = 0; i < dict.size(); i++) {
+          res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
+        }
+        return res;
+      });
+}

data/ext/tomoto/llda.cpp ADDED Viewed

@@ -0,0 +1,29 @@
+#include <LLDA.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+// Registers the Ruby class "LLDA" under module `m`, backed by
+// tomoto::ILLDAModel (tomoto::ILDAModel is given as the second template
+// argument of define_class_under).
+void init_llda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(m, "LLDA")
+    // Factory. A negative seed is replaced by a value drawn from
+    // std::random_device before the model is created.
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        const auto term_weight = static_cast<tomoto::TermWeight>(tw);
+        return tomoto::ILLDAModel::create(term_weight, k, alpha, eta, seed);
+      })
+    // Builds a document from `words`, attaches the labels under the "labels"
+    // key and hands it to the model.
+    .define_method(
+      "_add_doc",
+      *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
+        auto raw_doc = buildDoc(words);
+        raw_doc.misc["labels"] = labels;
+        return self.addDoc(raw_doc);
+      })
+    // Read-only accessor that forwards straight to the model.
+    .define_method("topics_per_label", *[](tomoto::ILLDAModel& self) { return self.getNumTopicsPerLabel(); });
+}