tomoto 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -3
  5. data/ext/tomoto/ext.cpp +34 -9
  6. data/ext/tomoto/extconf.rb +2 -1
  7. data/lib/tomoto/dmr.rb +1 -1
  8. data/lib/tomoto/gdmr.rb +1 -1
  9. data/lib/tomoto/version.rb +1 -1
  10. data/vendor/tomotopy/LICENSE +1 -1
  11. data/vendor/tomotopy/README.kr.rst +32 -3
  12. data/vendor/tomotopy/README.rst +30 -1
  13. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
  14. data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
  15. data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
  16. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
  17. data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
  18. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
  20. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
  21. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
  22. data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
  23. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
  24. data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
  25. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
  26. data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
  27. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
  28. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
  29. data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
  30. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
  31. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
  32. data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
  33. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
  34. data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
  35. data/vendor/tomotopy/src/Utils/math.h +1 -1
  36. data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
  37. data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
  38. data/vendor/variant/LICENSE +25 -0
  39. data/vendor/variant/LICENSE_1_0.txt +23 -0
  40. data/vendor/variant/README.md +102 -0
  41. data/vendor/variant/include/mapbox/optional.hpp +74 -0
  42. data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
  43. data/vendor/variant/include/mapbox/variant.hpp +974 -0
  44. data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
  45. metadata +15 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd4c36ff621f73c38bb066694a932f0a682c18591ddf05a9a0764bea0b6e4430
4
- data.tar.gz: 551e56c4bc17fb5a3a0aeac0db055960fcc5e45bf097bf88c7cbf9046f958e7d
3
+ metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
4
+ data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
5
5
  SHA512:
6
- metadata.gz: 565a91d0bb6d48142f38dc3d9e798ddb99bf41fda32762295362075fba972eea6b56b6bde126eab74677eba5fd525581b68c5efa73361a46fcb0b2796ab63684
7
- data.tar.gz: 415193e4eb6adbe5dce05328aadf9acb91f4acc50951484183a956455d7336f93961fe145465b1eeffaae78dad37ee1452defe832514c72b3c032860ed433cc8
6
+ metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
7
+ data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422
@@ -1,3 +1,7 @@
1
+ ## 0.1.3 (2020-12-19)
2
+
3
+ - Updated tomoto to 0.10.0
4
+
1
5
  ## 0.1.2 (2020-10-10)
2
6
 
3
7
  - Added `summary` method
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2019
3
+ Copyright (c) 2019, bab2min
4
4
  Copyright (c) 2020 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
4
 
5
- [![Build Status](https://travis-ci.org/ankane/tomoto.svg?branch=master)](https://travis-ci.org/ankane/tomoto)
5
+ [![Build Status](https://github.com/ankane/tomoto/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -19,7 +19,7 @@ It can take 10-20 minutes to compile the extension.
19
19
  Train a model
20
20
 
21
21
  ```ruby
22
- model = Tomoto::LDA.new(k: 3)
22
+ model = Tomoto::LDA.new(k: 2)
23
23
  model.add_doc("text from document one")
24
24
  model.add_doc("text from document two")
25
25
  model.add_doc("text from document three")
@@ -98,7 +98,7 @@ If a method or option you need isn’t supported, feel free to open an issue.
98
98
  ## Examples
99
99
 
100
100
  - [LDA](examples/lda_basic.rb)
101
- - [HDP](examples/hdp.rb)
101
+ - [HDP](examples/hdp_basic.rb)
102
102
 
103
103
  ## Tokenization
104
104
 
@@ -96,6 +96,12 @@ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
96
96
  return res;
97
97
  }
98
98
 
99
+ tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
100
+ tomoto::RawDoc doc;
101
+ doc.rawWords = words;
102
+ return doc;
103
+ }
104
+
99
105
  extern "C"
100
106
  void Init_ext()
101
107
  {
@@ -126,7 +132,7 @@ void Init_ext()
126
132
  .define_method(
127
133
  "_add_doc",
128
134
  *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
129
- self.addDoc(words);
135
+ self.addDoc(buildDoc(words));
130
136
  })
131
137
  .define_method(
132
138
  "alpha",
@@ -379,8 +385,10 @@ void Init_ext()
379
385
  })
380
386
  .define_method(
381
387
  "_add_doc",
382
- *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<std::string> metadata) {
383
- self.addDoc(words, metadata);
388
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
389
+ auto doc = buildDoc(words);
390
+ doc.misc["metadata"] = metadata;
391
+ self.addDoc(doc);
384
392
  })
385
393
  .define_method(
386
394
  "alpha_epsilon",
@@ -433,8 +441,10 @@ void Init_ext()
433
441
  })
434
442
  .define_method(
435
443
  "_add_doc",
436
- *[](tomoto::IDTModel& self, std::vector<std::string> words, size_t timepoint) {
437
- self.addDoc(words, timepoint);
444
+ *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
445
+ auto doc = buildDoc(words);
446
+ doc.misc["timepoint"] = timepoint;
447
+ self.addDoc(doc);
438
448
  })
439
449
  .define_method(
440
450
  "lr_a",
@@ -489,6 +499,13 @@ void Init_ext()
489
499
  }
490
500
  return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
491
501
  })
502
+ .define_method(
503
+ "_add_doc",
504
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
505
+ auto doc = buildDoc(words);
506
+ doc.misc["metadata"] = metadata;
507
+ self.addDoc(doc);
508
+ })
492
509
  .define_method(
493
510
  "degrees",
494
511
  *[](tomoto::IGDMRModel& self) {
@@ -643,7 +660,9 @@ void Init_ext()
643
660
  .define_method(
644
661
  "_add_doc",
645
662
  *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
646
- self.addDoc(words, delimiter);
663
+ auto doc = buildDoc(words);
664
+ doc.misc["delimiter"] = delimiter;
665
+ self.addDoc(doc);
647
666
  })
648
667
  .define_method(
649
668
  "alpha_g",
@@ -708,7 +727,9 @@ void Init_ext()
708
727
  .define_method(
709
728
  "_add_doc",
710
729
  *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
711
- self.addDoc(words, labels);
730
+ auto doc = buildDoc(words);
731
+ doc.misc["labels"] = labels;
732
+ self.addDoc(doc);
712
733
  })
713
734
  .define_method(
714
735
  "topics_per_label",
@@ -728,7 +749,9 @@ void Init_ext()
728
749
  .define_method(
729
750
  "_add_doc",
730
751
  *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
731
- self.addDoc(words, labels);
752
+ auto doc = buildDoc(words);
753
+ doc.misc["labels"] = labels;
754
+ self.addDoc(doc);
732
755
  })
733
756
  .define_method(
734
757
  "latent_topics",
@@ -753,7 +776,9 @@ void Init_ext()
753
776
  .define_method(
754
777
  "_add_doc",
755
778
  *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
756
- self.addDoc(words, y);
779
+ auto doc = buildDoc(words);
780
+ doc.misc["y"] = y;
781
+ self.addDoc(doc);
757
782
  })
758
783
  .define_method(
759
784
  "f",
@@ -23,9 +23,10 @@ ext = File.expand_path(".", __dir__)
23
23
  tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
24
24
  eigen = File.expand_path("../../vendor/eigen", __dir__)
25
25
  eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
26
+ variant = File.expand_path("../../vendor/variant/include", __dir__)
26
27
 
27
28
  $srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
28
- $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand}"
29
+ $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
29
30
  $VPATH << tomoto
30
31
 
31
32
  create_makefile("tomoto/ext")
@@ -9,7 +9,7 @@ module Tomoto
9
9
  end
10
10
 
11
11
  def add_doc(doc, metadata: "")
12
- _add_doc(prepare_doc(doc), [metadata])
12
+ _add_doc(prepare_doc(doc), metadata)
13
13
  end
14
14
 
15
15
  def lambdas
@@ -9,7 +9,7 @@ module Tomoto
9
9
  end
10
10
 
11
11
  def add_doc(doc, metadata: [])
12
- _add_doc(prepare_doc(doc), metadata.map(&:to_s))
12
+ _add_doc(prepare_doc(doc), metadata)
13
13
  end
14
14
  end
15
15
  end
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2019
3
+ Copyright (c) 2019, bab2min
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -35,7 +35,7 @@ tomotopy 란?
35
35
 
36
36
  더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
37
37
 
38
- tomotopy의 가장 최신버전은 0.9.1 입니다.
38
+ tomotopy의 가장 최신버전은 0.10.0 입니다.
39
39
 
40
40
  시작하기
41
41
  ---------------
@@ -255,6 +255,17 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
255
255
 
256
256
  역사
257
257
  -------
258
+ * 0.10.0 (2020-12-19)
259
+ * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
260
+ * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
261
+ * `tomotopy.utils.Corpus.extract_ngrams`와 `tomotopy.utils.Corpus.concat_ngrams`이 추가되었습니다. PMI를 이용해 코퍼스 내에서 자동으로 n-gram collocation을 발견해 한 단어로 합치는 기능을 수행합니다.
262
+ * `tomotopy.LDAModel.add_corpus`가 추가되었고, `tomotopy.LDAModel.infer`가 Raw 코퍼스를 입력으로 받을 수 있게 되었습니다.
263
+ * `tomotopy.coherence` 모듈이 추가되었습니다. 생성된 토픽 모델의 coherence를 계산하는 기능을 담당합니다.
264
+ * `tomotopy.label.FoRelevance`에 window_size 파라미터가 추가되었습니다.
265
+ * `tomotopy.HDPModel` 학습 시 종종 NaN이 발생하는 문제를 해결했습니다.
266
+ * 이제 Python3.9를 지원합니다.
267
+ * py-cpuinfo에 대한 의존성이 제거되고, 모듈 로딩속도가 개선되었습니다.
268
+
258
269
  * 0.9.1 (2020-08-08)
259
270
  * 0.9.0 버전의 메모리 누수 문제가 해결되었습니다.
260
271
  * `tomotopy.CTModel.summary()`가 잘못된 결과를 출력하는 문제가 해결되었습니다.
@@ -277,7 +288,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
277
288
 
278
289
  * 0.8.2 (2020-07-14)
279
290
  * `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다.
280
- * `seed`가 동일해서 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
291
+ * `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
281
292
 
282
293
  * 0.8.1 (2020-06-08)
283
294
  * `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다.
@@ -302,7 +313,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
302
313
  * 0.7.0 (2020-04-18)
303
314
  * `tomotopy.DTModel`이 추가되었습니다.
304
315
  * `tomotopy.utils.Corpus.save`가 제대로 작동하지 않는 버그가 수정되었습니다.
305
- * `tomotopy.LDAModel.get_count_vector`가 추가되었습니다.
316
+ * `tomotopy.Document.get_count_vector`가 추가되었습니다.
306
317
  * 리눅스용 바이너리가 manylinux2010 버전으로 변경되었고 이에 따른 최적화가 진행되었습니다.
307
318
 
308
319
  * 0.6.2 (2020-03-28)
@@ -373,3 +384,21 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
373
384
 
374
385
  * 0.1.0 (2019-05-12)
375
386
  * **tomotopy**의 최초 버전
387
+
388
+ 다른 언어용 바인딩
389
+ -------------------
390
+
391
+ * Ruby: https://github.com/ankane/tomoto
392
+
393
+ 포함된 라이브러리들의 라이센스
394
+ -------------------------------
395
+ * Eigen:
396
+ This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
397
+ A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
398
+ The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
399
+
400
+ * EigenRand: `MIT License
401
+ <licenses_bundled/EigenRand>`_
402
+
403
+ * Mapbox Variant: `BSD License
404
+ <licenses_bundled/MapboxVariant>`_
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
36
36
 
37
37
  Please visit https://bab2min.github.io/tomotopy to see more information.
38
38
 
39
- The most recent version of tomotopy is 0.9.1.
39
+ The most recent version of tomotopy is 0.10.0.
40
40
 
41
41
  Getting Started
42
42
  ---------------
@@ -261,6 +261,17 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
261
261
 
262
262
  History
263
263
  -------
264
+ * 0.10.0 (2020-12-19)
265
+ * The interfaces of `tomotopy.utils.Corpus` and of `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
266
+ * __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also by Iterable[int], slicing are supported. Also indexing by uid is supported.
267
+ * New methods `tomotopy.utils.Corpus.extract_ngrams` and `tomotopy.utils.Corpus.concat_ngrams` were added. They extract n-gram collocations using PMI and concatenate them into single words.
268
+ * A new method `tomotopy.LDAModel.add_corpus` was added, and `tomotopy.LDAModel.infer` can receive corpus as input.
269
+ * A new module `tomotopy.coherence` was added. It provides the way to calculate coherence of the model.
270
+ * A parameter `window_size` was added to `tomotopy.label.FoRelevance`.
271
+ * An issue was fixed where NaN often occurs when training `tomotopy.HDPModel`.
272
+ * Now Python3.9 is supported.
273
+ * The dependency on py-cpuinfo was removed, and module initialization was improved.
274
+
264
275
  * 0.9.1 (2020-08-08)
265
276
  * Memory leaks of version 0.9.0 was fixed.
266
277
  * `tomotopy.CTModel.summary()` was fixed.
@@ -380,3 +391,21 @@ History
380
391
 
381
392
  * 0.1.0 (2019-05-12)
382
393
  * First version of **tomotopy**
394
+
395
+ Bindings for Other Languages
396
+ ------------------------------
397
+
398
+ * Ruby: https://github.com/ankane/tomoto
399
+
400
+ Bundled Libraries and Their License
401
+ ------------------------------------
402
+ * Eigen:
403
+ This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
404
+ A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
405
+ The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
406
+
407
+ * EigenRand: `MIT License
408
+ <licenses_bundled/EigenRand>`_
409
+
410
+ * Mapbox Variant: `BSD License
411
+ <licenses_bundled/MapboxVariant>`_
@@ -5,161 +5,74 @@
5
5
 
6
6
  using namespace tomoto::label;
7
7
 
8
- namespace std
8
+ class DocWrapper
9
9
  {
10
- template <>
11
- struct hash<pair<tomoto::Vid, tomoto::Vid>>
10
+ const tomoto::DocumentBase* doc;
11
+ public:
12
+ DocWrapper(const tomoto::DocumentBase* _doc = nullptr)
13
+ : doc{ _doc }
12
14
  {
13
- size_t operator()(const pair<tomoto::Vid, tomoto::Vid>& k) const
14
- {
15
- return hash<tomoto::Vid>{}(k.first) ^ hash<tomoto::Vid>{}(k.second);
16
- }
17
- };
18
- }
19
-
20
- std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel * tm) const
21
- {
22
- auto& vocabFreqs = tm->getVocabCf();
23
- auto& vocabDf = tm->getVocabDf();
24
-
25
- // counting unigrams & bigrams
26
- std::unordered_map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
15
+ }
27
16
 
28
- for (size_t i = 0; i < tm->getNumDocs(); ++i)
17
+ size_t size() const
29
18
  {
30
- std::unordered_set<std::pair<Vid, Vid>> uniqBigram;
31
- auto doc = tm->getDoc(i);
32
- Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
33
- for (size_t j = 1; j < doc->words.size(); ++j)
34
- {
35
- Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
36
- if (vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
37
- {
38
- if (vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
39
- {
40
- bigramCnt[std::make_pair(prevWord, curWord)]++;
41
- uniqBigram.emplace(prevWord, curWord);
42
- }
43
- }
44
- prevWord = curWord;
45
- }
46
-
47
- for (auto& p : uniqBigram) bigramDf[p]++;
19
+ return doc->words.size();
48
20
  }
49
21
 
50
-
51
- // counting ngrams
52
- std::vector<TrieEx<Vid, size_t>> trieNodes;
53
-
54
- if (maxLabelLen > 2)
22
+ tomoto::Vid operator[](size_t idx) const
55
23
  {
56
- std::unordered_set<std::pair<Vid, Vid>> validPair;
57
- for (auto& p : bigramCnt)
58
- {
59
- if (p.second >= candMinCnt) validPair.emplace(p.first);
60
- }
61
-
62
- trieNodes.resize(1);
63
- auto allocNode = [&]() { return trieNodes.emplace_back(), &trieNodes.back(); };
64
-
65
- for (size_t i = 0; i < tm->getNumDocs(); ++i)
66
- {
67
- auto doc = tm->getDoc(i);
68
- if (trieNodes.capacity() < trieNodes.size() + doc->words.size() * maxLabelLen)
69
- {
70
- trieNodes.reserve(std::max(trieNodes.size() + doc->words.size() * maxLabelLen, trieNodes.capacity() * 2));
71
- }
72
-
73
- Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
74
- size_t labelLen = 0;
75
- auto node = &trieNodes[0];
76
- if (vocabFreqs[prevWord] >= candMinCnt)
77
- {
78
- node = trieNodes[0].makeNext(prevWord, allocNode);
79
- node->val++;
80
- labelLen = 1;
81
- }
82
-
83
- for (size_t j = 1; j < doc->words.size(); ++j)
84
- {
85
- Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
24
+ return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
25
+ }
26
+ };
86
27
 
87
- if (vocabFreqs[curWord] < candMinCnt)
88
- {
89
- node = &trieNodes[0];
90
- labelLen = 0;
91
- }
92
- else
93
- {
94
- if (labelLen >= maxLabelLen)
95
- {
96
- node = node->getFail();
97
- labelLen--;
98
- }
28
+ class DocIterator
29
+ {
30
+ const tomoto::ITopicModel* tm;
31
+ size_t idx;
32
+ public:
33
+ DocIterator(const tomoto::ITopicModel* _tm = nullptr, size_t _idx = 0)
34
+ : tm{ _tm }, idx{ _idx }
35
+ {
36
+ }
99
37
 
100
- if (validPair.count(std::make_pair(prevWord, curWord)))
101
- {
102
- auto nnode = node->makeNext(curWord, allocNode);
103
- node = nnode;
104
- do
105
- {
106
- nnode->val++;
107
- } while (nnode = nnode->getFail());
108
- labelLen++;
109
- }
110
- else
111
- {
112
- node = trieNodes[0].makeNext(curWord, allocNode);
113
- node->val++;
114
- labelLen = 1;
115
- }
116
- }
117
- prevWord = curWord;
118
- }
119
- }
38
+ DocWrapper operator*() const
39
+ {
40
+ return { tm->getDoc(idx) };
120
41
  }
121
42
 
122
- // calculating PMIs
123
- std::vector<Candidate> candidates;
124
- for (auto& p : bigramCnt)
43
+ DocIterator& operator++()
125
44
  {
126
- auto& bigram = p.first;
127
- if (p.second < candMinCnt) continue;
128
- if (bigramDf[bigram] < candMinDf) continue;
129
- auto pmi = std::log(p.second * (float)tm->getN()
130
- / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
131
- if (pmi <= 0) continue;
132
- candidates.emplace_back(pmi, bigram.first, bigram.second);
45
+ ++idx;
46
+ return *this;
133
47
  }
134
48
 
135
- if (maxLabelLen > 2)
49
+ bool operator==(const DocIterator& o) const
136
50
  {
137
- std::vector<Vid> rkeys;
138
- trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
139
- {
140
- if (rkeys.size() <= 2 || node->val < candMinCnt) return;
141
- float n = (float)tm->getN();
142
- auto pmi = node->val / n;
143
- for (auto k : rkeys)
144
- {
145
- pmi *= n / vocabFreqs[k];
146
- }
147
- pmi = std::log(pmi);
148
- candidates.emplace_back(pmi, rkeys);
149
- }, rkeys);
51
+ return tm == o.tm && idx == o.idx;
150
52
  }
151
53
 
152
- std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
54
+ bool operator!=(const DocIterator& o) const
153
55
  {
154
- return a.score > b.score;
155
- });
156
- if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
56
+ return tm != o.tm || idx != o.idx;
57
+ }
58
+ };
157
59
 
158
- for (size_t i = 0; i < vocabDf.size(); ++i)
60
+ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) const
61
+ {
62
+ auto& vocabFreqs = tm->getVocabCf();
63
+ auto& vocabDf = tm->getVocabDf();
64
+ auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
65
+ vocabFreqs, vocabDf,
66
+ candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
67
+ );
68
+ if (minLabelLen <= 1)
159
69
  {
160
- if (vocabFreqs[i] < candMinCnt) continue;
161
- if (vocabDf[i] < candMinDf) continue;
162
- candidates.emplace_back(0.f, i);
70
+ for (size_t i = 0; i < vocabDf.size(); ++i)
71
+ {
72
+ if (vocabFreqs[i] < candMinCnt) continue;
73
+ if (vocabDf[i] < candMinDf) continue;
74
+ candidates.emplace_back(0.f, i);
75
+ }
163
76
  }
164
77
  return candidates;
165
78
  }
@@ -172,8 +85,7 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
172
85
  auto node = root;
173
86
  for (size_t j = 0; j < doc->words.size(); ++j)
174
87
  {
175
- size_t t = doc->wOrder.empty() ? j : doc->wOrder[j];
176
- tomoto::Vid curWord = doc->words[t];
88
+ tomoto::Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
177
89
  if (curWord < tm->getV()) bdf[curWord] = 1;
178
90
  auto nnode = node->getNext(curWord);
179
91
  while (!nnode)
@@ -191,16 +103,15 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
191
103
  // the matched candidate is found
192
104
  if (nnode->val && nnode->val != (size_t)-1)
193
105
  {
194
- auto& c = candidates[nnode->val - 1];
195
106
  tomoto::OptionalLock<_lock> lock{ mtx[(nnode->val - 1) % (pool ? pool->getNumWorkers() : 1)] };
107
+ auto& c = candidates[nnode->val - 1];
196
108
  if (c.name.empty() && !doc->origWordPos.empty())
197
109
  {
198
110
  size_t start = doc->origWordPos[j + 1 - c.w.size()];
199
111
  size_t end = doc->origWordPos[j] + doc->origWordLen[j];
200
112
  c.names[doc->rawStr.substr(start, end - start)]++;
201
113
  }
202
- auto& docIds = c.docIds;
203
- if (docIds.empty() || docIds.back() != docId) docIds.emplace_back(docId);
114
+ c.docIds.emplace(docId);
204
115
  }
205
116
  } while (nnode = nnode->getFail());
206
117
  }
@@ -268,7 +179,22 @@ void FoRelevance::estimateContexts()
268
179
  wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
269
180
  }
270
181
 
271
- auto calcScores = [&](CandidateEx& c)
182
+ size_t totDocCnt = 0;
183
+ if (windowSize == (size_t)-1)
184
+ {
185
+ totDocCnt = tm->getNumDocs();
186
+ }
187
+ else
188
+ {
189
+ for (size_t i = 0; i < tm->getNumDocs(); ++i)
190
+ {
191
+ size_t s = tm->getDoc(i)->words.size();
192
+ if (s <= windowSize) totDocCnt += 1;
193
+ else totDocCnt += s - windowSize + 1;
194
+ }
195
+ }
196
+
197
+ auto calcScores = [&](CandidateEx& c, size_t windowSize)
272
198
  {
273
199
  if (c.docIds.size() < candMinDf) return;
274
200
  if (c.name.empty() && !c.names.empty())
@@ -284,20 +210,80 @@ void FoRelevance::estimateContexts()
284
210
  }
285
211
  }
286
212
 
213
+ size_t docCnt = 0;
287
214
  Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
288
215
  for (auto& docId : c.docIds)
289
216
  {
290
217
  thread_local Eigen::VectorXi bdf(this->tm->getV());
291
218
  bdf.setZero();
292
219
  auto doc = this->tm->getDoc(docId);
293
- for (size_t i = 0; i < doc->words.size(); ++i)
220
+ if (doc->words.size() <= windowSize)
294
221
  {
295
- if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
222
+ for (size_t i = 0; i < doc->words.size(); ++i)
223
+ {
224
+ if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
225
+ }
226
+ docCnt++;
227
+ wcPMI += bdf.template cast<Float>();
228
+ }
229
+ else
230
+ {
231
+ auto wit = c.w.begin();
232
+ std::deque<size_t> wpos;
233
+ for (size_t i = 0; i < windowSize; ++i)
234
+ {
235
+ Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
236
+ if (word < this->tm->getV()) bdf[word]++;
237
+
238
+ if (word == *wit)
239
+ {
240
+ if (++wit == c.w.end())
241
+ {
242
+ wpos.emplace_back(i + 1);
243
+ wit = c.w.begin();
244
+ }
245
+ }
246
+ else if (word == c.w[0]) wit = c.w.begin() + 1;
247
+ else wit = c.w.begin();
248
+ }
249
+ if (!wpos.empty())
250
+ {
251
+ docCnt++;
252
+ wcPMI += Eigen::bool2float(bdf.array()).matrix();
253
+ }
254
+
255
+ for (size_t i = windowSize; i < doc->words.size(); ++i)
256
+ {
257
+ Vid oword = doc->words[doc->wOrder.empty() ? (i - windowSize) : doc->wOrder[i - windowSize]];
258
+ Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
259
+ if (oword < this->tm->getV()) bdf[oword]--;
260
+ if (word < this->tm->getV()) bdf[word]++;
261
+ if (!wpos.empty() && wpos.front() - c.w.size() <= i - windowSize)
262
+ {
263
+ wpos.pop_front();
264
+ }
265
+
266
+ if (word == *wit)
267
+ {
268
+ if (++wit == c.w.end())
269
+ {
270
+ wpos.emplace_back(i + 1);
271
+ wit = c.w.begin();
272
+ }
273
+ }
274
+ else if (word == c.w[0]) wit = c.w.begin() + 1;
275
+ else wit = c.w.begin();
276
+
277
+ if (!wpos.empty())
278
+ {
279
+ docCnt++;
280
+ wcPMI += Eigen::bool2float(bdf.array()).matrix();
281
+ }
282
+ }
296
283
  }
297
- wcPMI += bdf.cast<Float>();
298
284
  }
299
285
  c.scores = wordTopicDist.transpose() *
300
- ((wcPMI.array() + smoothing) * this->tm->getNumDocs() / c.docIds.size() / df.cast<Float>()).log().matrix();
286
+ ((wcPMI.array() + smoothing) * totDocCnt / docCnt / df.cast<Float>()).log().matrix();
301
287
  };
302
288
 
303
289
  if (pool)
@@ -311,7 +297,7 @@ void FoRelevance::estimateContexts()
311
297
  {
312
298
  for (size_t i = g; i < candidates.size(); i += groups)
313
299
  {
314
- calcScores(candidates[i]);
300
+ calcScores(candidates[i], windowSize);
315
301
  }
316
302
  }, g));
317
303
  }
@@ -321,7 +307,7 @@ void FoRelevance::estimateContexts()
321
307
  {
322
308
  for (auto& c : candidates)
323
309
  {
324
- calcScores(c);
310
+ calcScores(c, windowSize);
325
311
  }
326
312
  }
327
313