RubyGems - tomoto - Versions diffs - 0.1.3 → 0.1.4 - Mend

tomoto 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/LICENSE.txt +1 -1
data/README.md +7 -0
data/ext/tomoto/ct.cpp +54 -0
data/ext/tomoto/dmr.cpp +62 -0
data/ext/tomoto/dt.cpp +82 -0
data/ext/tomoto/ext.cpp +27 -773
data/ext/tomoto/gdmr.cpp +34 -0
data/ext/tomoto/hdp.cpp +42 -0
data/ext/tomoto/hlda.cpp +66 -0
data/ext/tomoto/hpa.cpp +27 -0
data/ext/tomoto/lda.cpp +250 -0
data/ext/tomoto/llda.cpp +29 -0
data/ext/tomoto/mglda.cpp +71 -0
data/ext/tomoto/pa.cpp +27 -0
data/ext/tomoto/plda.cpp +29 -0
data/ext/tomoto/slda.cpp +40 -0
data/ext/tomoto/utils.h +84 -0
data/lib/tomoto/tomoto.bundle +0 -0
data/lib/tomoto/tomoto.so +0 -0
data/lib/tomoto/version.rb +1 -1
data/vendor/tomotopy/README.kr.rst +12 -3
data/vendor/tomotopy/README.rst +12 -3
data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
data/vendor/tomotopy/src/Utils/math.h +8 -4
data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
metadata +24 -60

data/ext/tomoto/mglda.cpp ADDED Viewed

@@ -0,0 +1,71 @@
+#include <MGLDA.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+void init_mglda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(m, "MGLDA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
+        return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
+      })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
+        auto doc = buildDoc(words);
+        doc.misc["delimiter"] = delimiter;
+        return self.addDoc(doc);
+      })
+    .define_method(
+      "alpha_g",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlpha();
+      })
+    .define_method(
+      "alpha_l",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlphaL();
+      })
+    .define_method(
+      "alpha_mg",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlphaM();
+      })
+    .define_method(
+      "alpha_ml",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlphaML();
+      })
+    .define_method(
+      "eta_g",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getEta();
+      })
+    .define_method(
+      "eta_l",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getEtaL();
+      })
+    .define_method(
+      "gamma",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getGamma();
+      })
+    .define_method(
+      "k_g",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getK();
+      })
+    .define_method(
+      "k_l",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getKL();
+      })
+    .define_method(
+      "t",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getT();
+      });
+}

data/ext/tomoto/pa.cpp ADDED Viewed

@@ -0,0 +1,27 @@
+#include <PA.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+void init_pa(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(m, "PA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
+      })
+    .define_method(
+      "k1",
+      *[](tomoto::IPAModel& self) {
+        return self.getK();
+      })
+    .define_method(
+      "k2",
+      *[](tomoto::IPAModel& self) {
+        return self.getK2();
+      });
+}

data/ext/tomoto/plda.cpp ADDED Viewed

@@ -0,0 +1,29 @@
+#include <PLDA.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+void init_plda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
+      })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
+        auto doc = buildDoc(words);
+        doc.misc["labels"] = labels;
+        return self.addDoc(doc);
+      })
+    .define_method(
+      "latent_topics",
+      *[](tomoto::IPLDAModel& self) {
+        return self.getNumLatentTopics();
+      });
+}

data/ext/tomoto/slda.cpp ADDED Viewed

@@ -0,0 +1,40 @@
+#include <SLDA.h>
+#include <rice/Module.hpp>
+#include "utils.h"
+void init_slda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        std::vector<tomoto::ISLDAModel::GLM> vars;
+        vars.reserve(rb_vars.size());
+        for (auto const& v : rb_vars) {
+          vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
+        }
+        return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
+      })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
+        auto doc = buildDoc(words);
+        doc.misc["y"] = y;
+        return self.addDoc(doc);
+      })
+    .define_method(
+      "f",
+      *[](tomoto::ISLDAModel& self) {
+        return self.getF();
+      })
+    .define_method(
+      "_var_type",
+      *[](tomoto::ISLDAModel& self, size_t var_id) {
+        if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
+        return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
+      });
+}

data/ext/tomoto/utils.h ADDED Viewed

@@ -0,0 +1,84 @@
+#pragma once
+#include <rice/Array.hpp>
+using Rice::Array;
+using Rice::Object;
+template<>
+inline
+Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
+{
+  Array res;
+  for (auto const& v : x) {
+    res.push(v);
+  }
+  return res;
+}
+template<>
+inline
+Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
+{
+  Array res;
+  for (auto const& v : x) {
+    res.push(v);
+  }
+  return res;
+}
+template<>
+inline
+Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
+{
+  Array res;
+  for (auto const& v : x) {
+    res.push(v);
+  }
+  return res;
+}
+template<>
+inline
+std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
+{
+  Array a = Array(x);
+  std::vector<std::string> res;
+  res.reserve(a.size());
+  for (auto const& v : a) {
+    res.push_back(from_ruby<std::string>(v));
+  }
+  return res;
+}
+template<>
+inline
+std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
+{
+  Array a = Array(x);
+  std::vector<tomoto::Float> res;
+  res.reserve(a.size());
+  for (auto const& v : a) {
+    res.push_back(from_ruby<tomoto::Float>(v));
+  }
+  return res;
+}
+template<>
+inline
+std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
+{
+  Array a = Array(x);
+  std::vector<uint64_t> res;
+  res.reserve(a.size());
+  for (auto const& v : a) {
+    res.push_back(from_ruby<uint64_t>(v));
+  }
+  return res;
+}
+inline tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
+  tomoto::RawDoc doc;
+  doc.rawWords = words;
+  return doc;
+}

data/lib/tomoto/tomoto.bundle ADDED Viewed

Binary file

data/lib/tomoto/tomoto.so ADDED Viewed

Binary file

data/lib/tomoto/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Tomoto
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end

data/vendor/tomotopy/README.kr.rst CHANGED Viewed

@@ -35,7 +35,7 @@ tomotopy 란?
 더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
-tomotopy의 가장 최신버전은 0.10.0 입니다.
+tomotopy의 가장 최신버전은 0.10.2 입니다.
 시작하기
 ---------------
@@ -245,7 +245,7 @@ LDA모델로 1000회 iteration을 수행시 걸리는 시간을 초 단위로
 예제 코드
 ---------
-tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/examples/ 를 확인하시길 바랍니다.
+tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/main/examples/ 를 확인하시길 바랍니다.
 예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다.
@@ -255,6 +255,16 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 역사
 -------
+* 0.10.2 (2021-02-16)
+    * `tomotopy.CTModel.train`가 큰 K값에 대해 실패하는 문제가 수정되었습니다.
+    * `tomotopy.utils.Corpus`가 `uid`값을 잃는 문제가 수정되었습니다.
+* 0.10.1 (2021-02-14)
+    * `tomotopy.utils.Corpus.extract_ngrams`에 빈 문헌을 입력시 발생하던 에러를 수정했습니다.
+    * `tomotopy.LDAModel.infer`가 올바른 입력에도 예외를 발생시키던 문제를 수정했습니다.
+    * `tomotopy.HLDAModel.infer`가 잘못된 `tomotopy.Document.path` 값을 생성하는 문제를 수정했습니다.
+    * `tomotopy.HLDAModel.train`에 새로운 파라미터 `freeze_topics`가 추가되었습니다. 이를 통해 학습 시 신규 토픽 생성 여부를 조정할 수 있습니다.
 * 0.10.0 (2020-12-19)
     * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
     * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
@@ -387,7 +397,6 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 다른 언어용 바인딩
 -------------------
 * Ruby: https://github.com/ankane/tomoto
 포함된 라이브러리들의 라이센스

data/vendor/tomotopy/README.rst CHANGED Viewed

@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
 Please visit https://bab2min.github.io/tomotopy to see more information.
-The most recent version of tomotopy is 0.10.0.
+The most recent version of tomotopy is 0.10.2.
 Getting Started
 ---------------
@@ -250,7 +250,7 @@ See `word_prior_example` in `example.py` for more details.
 Examples
 --------
-You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/master/examples/ .
+You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/main/examples/ .
 You can also get the data file used in the example code at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view .
@@ -261,6 +261,16 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 History
 -------
+* 0.10.2 (2021-02-16)
+    * An issue was fixed where `tomotopy.CTModel.train` fails with large K.
+    * An issue was fixed where `tomotopy.utils.Corpus` loses its `uid` values.
+* 0.10.1 (2021-02-14)
+    * An issue was fixed where `tomotopy.utils.Corpus.extract_ngrams` crashes with empty input.
+    * An issue was fixed where `tomotopy.LDAModel.infer` raises exception with valid input.
+    * An issue was fixed where `tomotopy.HLDAModel.infer` generates wrong `tomotopy.Document.path`.
+    * Since a new parameter `freeze_topics` for `tomotopy.HLDAModel.train` was added, you can control whether to create a new topic or not when training.
 * 0.10.0 (2020-12-19)
     * The interfaces of `tomotopy.utils.Corpus` and `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
     * __getitem__ of `tomotopy.utils.Corpus` was improved. In addition to indexing by int, multiple indexing by Iterable[int] or by slicing is supported. Indexing by uid is also supported.
@@ -394,7 +404,6 @@ History
 Bindings for Other Languages
 ------------------------------
 * Ruby: https://github.com/ankane/tomoto
 Bundled Libraries and Their License

data/vendor/tomotopy/src/Labeling/FoRelevance.cpp CHANGED Viewed

@@ -2,6 +2,7 @@
 #include <numeric>
 #include "FoRelevance.h"
+#include "Phraser.hpp"
 using namespace tomoto::label;
@@ -23,6 +24,26 @@ public:
 	{
 		return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
 	}
+	auto begin() const -> decltype(doc->words.begin())
+	{
+		return doc->words.begin();
+	}
+	auto end() const -> decltype(doc->words.end())
+	{
+		return doc->words.end();
+	}
+	auto rbegin() const -> decltype(doc->words.rbegin())
+	{
+		return doc->words.rbegin();
+	}
+	auto rend() const -> decltype(doc->words.rend())
+	{
+		return doc->words.rend();
+	}
 };
 class DocIterator
@@ -61,9 +82,10 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
 {
 	auto& vocabFreqs = tm->getVocabCf();
 	auto& vocabDf = tm->getVocabDf();
-	auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
+	auto candidates = phraser::extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
 		vocabFreqs, vocabDf,
-		candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
+		candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, 0.f,
+		normalized
 	);
 	if (minLabelLen <= 1)
 	{
@@ -77,6 +99,29 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
 	return candidates;
 }
+std::vector<Candidate> tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const
+{
+	auto& vocabFreqs = tm->getVocabCf();
+	auto& vocabDf = tm->getVocabDf();
+	auto candidates = phraser::extractPMIBENgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
+		vocabFreqs, vocabDf,
+		candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates,
+		0.f, 0.f
+	);
+	if (minLabelLen <= 1)
+	{
+		for (size_t i = 0; i < vocabDf.size(); ++i)
+		{
+			if (vocabFreqs[i] < candMinCnt) continue;
+			if (vocabDf[i] < candMinDf) continue;
+			candidates.emplace_back(0.f, i);
+		}
+	}
+	return candidates;
+}
 template<bool _lock>
 const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::DocumentBase* doc, const tomoto::Trie<tomoto::Vid, size_t>* root)
 {

data/vendor/tomotopy/src/Labeling/FoRelevance.h CHANGED Viewed

@@ -4,6 +4,7 @@
 #include "Labeler.h"
 #include "../Utils/EigenAddonOps.hpp"
 #include "../Utils/Trie.hpp"
+#include "../Utils/ThreadPool.hpp"
 /*
 Implementation of First-order Relevance for topic labeling by bab2min
@@ -16,166 +17,35 @@ namespace tomoto
 {
 	namespace label
 	{
-		template<typename _DocIter, typename _Freqs>
-		std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
-			_Freqs&& vocabFreqs, _Freqs&& vocabDf,
-			size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
+		class PMIExtractor : public IExtractor
 		{
-			struct vvhash
-			{
-				size_t operator()(const std::pair<Vid, Vid>& k) const
-				{
-					return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
-				}
-			};
-			// counting unigrams & bigrams
-			std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
-			for(auto docIt = docBegin; docIt != docEnd; ++docIt)
-			{
-				std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
-				auto doc = *docIt;
-				Vid prevWord = doc[0];
-				for (size_t j = 1; j < doc.size(); ++j)
-				{
-					Vid curWord = doc[j];
-					if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
-					{
-						if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
-						{
-							bigramCnt[std::make_pair(prevWord, curWord)]++;
-							uniqBigram.emplace(prevWord, curWord);
-						}
-					}
-					prevWord = curWord;
-				}
-				for (auto& p : uniqBigram) bigramDf[p]++;
-			}
-			// counting ngrams
-			std::vector<TrieEx<Vid, size_t>> trieNodes;
-			if (maxNgrams > 2)
-			{
-				std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
-				for (auto& p : bigramCnt)
-				{
-					if (p.second >= candMinCnt) validPair.emplace(p.first);
-				}
-				trieNodes.resize(1);
-				auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
-				for (auto docIt = docBegin; docIt != docEnd; ++docIt)
-				{
-					auto doc = *docIt;
-					if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
-					{
-						trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
-					}
-					Vid prevWord = doc[0];
-					size_t labelLen = 0;
-					auto node = &trieNodes[0];
-					if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
-					{
-						node = trieNodes[0].makeNext(prevWord, allocNode);
-						node->val++;
-						labelLen = 1;
-					}
-					for (size_t j = 1; j < doc.size(); ++j)
-					{
-						Vid curWord = doc[j];
-						if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
-						{
-							node = &trieNodes[0];
-							labelLen = 0;
-						}
-						else
-						{
-							if (labelLen >= maxNgrams)
-							{
-								node = node->getFail();
-								labelLen--;
-							}
-							if (validPair.count(std::make_pair(prevWord, curWord)))
-							{
-								auto nnode = node->makeNext(curWord, allocNode);
-								node = nnode;
-								do
-								{
-									nnode->val++;
-								} while (nnode = nnode->getFail());
-								labelLen++;
-							}
-							else
-							{
-								node = trieNodes[0].makeNext(curWord, allocNode);
-								node->val++;
-								labelLen = 1;
-							}
-						}
-						prevWord = curWord;
-					}
-				}
-			}
-			float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
-			// calculating PMIs
-			std::vector<Candidate> candidates;
-			for (auto& p : bigramCnt)
-			{
-				auto& bigram = p.first;
-				if (p.second < candMinCnt) continue;
-				if (bigramDf[bigram] < candMinDf) continue;
-				auto pmi = std::log(p.second * totN
-					/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
-				if (pmi <= 0) continue;
-				candidates.emplace_back(pmi, bigram.first, bigram.second);
-			}
-			if (maxNgrams > 2)
+			size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
+			bool normalized;
+		public:
+			PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
+				size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000,
+				bool _normalized = false
+			)
+				: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf },
+				minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen },
+				maxCandidates{ _maxCandidates }, normalized{ _normalized }
 			{
-				std::vector<Vid> rkeys;
-				trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
-				{
-					if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
-					auto pmi = node->val / totN;
-					for (auto k : rkeys)
-					{
-						pmi *= totN / vocabFreqs[k];
-					}
-					pmi = std::log(pmi);
-					if (pmi < minScore) return;
-					candidates.emplace_back(pmi, rkeys);
-				}, rkeys);
 			}
-			std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
-			{
-				return a.score > b.score;
-			});
-			if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
-			return candidates;
-		}
+			std::vector<Candidate> extract(const ITopicModel* tm) const override;
+		};
-		class PMIExtractor : public IExtractor
+		class PMIBEExtractor : public IExtractor
 		{
 			size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
 		public:
-			PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
-				: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
+			PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
+				size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000
+			)
+				: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
 			{
 			}
 			std::vector<Candidate> extract(const ITopicModel* tm) const override;
 		};
@@ -212,7 +82,7 @@ namespace tomoto
 		public:
 			template<typename _Iter>
-			FoRelevance(const ITopicModel* _tm,
+			FoRelevance(const ITopicModel* _tm,
 				_Iter candFirst, _Iter candEnd,
 				size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
 				size_t _windowSize = (size_t)-1,