RubyGems - tomoto - Versions diffs - 0.1.2 → 0.1.3 - Mend

tomoto 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/README.md +3 -3
data/ext/tomoto/ext.cpp +34 -9
data/ext/tomoto/extconf.rb +2 -1
data/lib/tomoto/dmr.rb +1 -1
data/lib/tomoto/gdmr.rb +1 -1
data/lib/tomoto/version.rb +1 -1
data/vendor/tomotopy/LICENSE +1 -1
data/vendor/tomotopy/README.kr.rst +32 -3
data/vendor/tomotopy/README.rst +30 -1
data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
data/vendor/tomotopy/src/Utils/math.h +1 -1
data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
data/vendor/variant/LICENSE +25 -0
data/vendor/variant/LICENSE_1_0.txt +23 -0
data/vendor/variant/README.md +102 -0
data/vendor/variant/include/mapbox/optional.hpp +74 -0
data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
data/vendor/variant/include/mapbox/variant.hpp +974 -0
data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
metadata +15 -7

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dd4c36ff621f73c38bb066694a932f0a682c18591ddf05a9a0764bea0b6e4430
-  data.tar.gz: 551e56c4bc17fb5a3a0aeac0db055960fcc5e45bf097bf88c7cbf9046f958e7d
+  metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
+  data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
 SHA512:
-  metadata.gz: 565a91d0bb6d48142f38dc3d9e798ddb99bf41fda32762295362075fba972eea6b56b6bde126eab74677eba5fd525581b68c5efa73361a46fcb0b2796ab63684
-  data.tar.gz: 415193e4eb6adbe5dce05328aadf9acb91f4acc50951484183a956455d7336f93961fe145465b1eeffaae78dad37ee1452defe832514c72b3c032860ed433cc8
+  metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
+  data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422

data/CHANGELOG.md CHANGED

@@ -1,3 +1,7 @@
+## 0.1.3 (2020-12-19)
+- Updated tomoto to 0.10.0
 ## 0.1.2 (2020-10-10)
 - Added `summary` method

data/LICENSE.txt CHANGED

@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2019
+Copyright (c) 2019, bab2min
 Copyright (c) 2020 Andrew Kane
 Permission is hereby granted, free of charge, to any person obtaining a copy

data/README.md CHANGED

@@ -2,7 +2,7 @@
 :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
-[![Build Status](https://travis-ci.org/ankane/tomoto.svg?branch=master)](https://travis-ci.org/ankane/tomoto)
+[![Build Status](https://github.com/ankane/tomoto/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto/actions)
 ## Installation
@@ -19,7 +19,7 @@ It can take 10-20 minutes to compile the extension.
 Train a model
 ```ruby
-model = Tomoto::LDA.new(k: 3)
+model = Tomoto::LDA.new(k: 2)
 model.add_doc("text from document one")
 model.add_doc("text from document two")
 model.add_doc("text from document three")
@@ -98,7 +98,7 @@ If a method or option you need isn’t supported, feel free to open an issue.
 ## Examples
 - [LDA](examples/lda_basic.rb)
-- [HDP](examples/hdp.rb)
+- [HDP](examples/hdp_basic.rb)
 ## Tokenization

data/ext/tomoto/ext.cpp CHANGED

@@ -96,6 +96,12 @@ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
   return res;
 }
+tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
+  tomoto::RawDoc doc;
+  doc.rawWords = words;
+  return doc;
+}
 extern "C"
 void Init_ext()
 {
@@ -126,7 +132,7 @@ void Init_ext()
     .define_method(
       "_add_doc",
       *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
-        self.addDoc(words);
+        self.addDoc(buildDoc(words));
       })
     .define_method(
       "alpha",
@@ -379,8 +385,10 @@ void Init_ext()
       })
     .define_method(
       "_add_doc",
-      *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<std::string> metadata) {
-        self.addDoc(words, metadata);
+      *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
+        auto doc = buildDoc(words);
+        doc.misc["metadata"] = metadata;
+        self.addDoc(doc);
       })
     .define_method(
       "alpha_epsilon",
@@ -433,8 +441,10 @@ void Init_ext()
       })
     .define_method(
       "_add_doc",
-      *[](tomoto::IDTModel& self, std::vector<std::string> words, size_t timepoint) {
-        self.addDoc(words, timepoint);
+      *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
+        auto doc = buildDoc(words);
+        doc.misc["timepoint"] = timepoint;
+        self.addDoc(doc);
       })
     .define_method(
       "lr_a",
@@ -489,6 +499,13 @@ void Init_ext()
         }
         return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
       })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
+        auto doc = buildDoc(words);
+        doc.misc["metadata"] = metadata;
+        self.addDoc(doc);
+      })
     .define_method(
       "degrees",
       *[](tomoto::IGDMRModel& self) {
@@ -643,7 +660,9 @@ void Init_ext()
     .define_method(
       "_add_doc",
       *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
-        self.addDoc(words, delimiter);
+        auto doc = buildDoc(words);
+        doc.misc["delimiter"] = delimiter;
+        self.addDoc(doc);
       })
     .define_method(
       "alpha_g",
@@ -708,7 +727,9 @@ void Init_ext()
     .define_method(
       "_add_doc",
       *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
-        self.addDoc(words, labels);
+        auto doc = buildDoc(words);
+        doc.misc["labels"] = labels;
+        self.addDoc(doc);
       })
     .define_method(
       "topics_per_label",
@@ -728,7 +749,9 @@ void Init_ext()
     .define_method(
       "_add_doc",
       *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
-        self.addDoc(words, labels);
+        auto doc = buildDoc(words);
+        doc.misc["labels"] = labels;
+        self.addDoc(doc);
       })
     .define_method(
       "latent_topics",
@@ -753,7 +776,9 @@ void Init_ext()
     .define_method(
       "_add_doc",
       *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
-        self.addDoc(words, y);
+        auto doc = buildDoc(words);
+        doc.misc["y"] = y;
+        self.addDoc(doc);
       })
     .define_method(
       "f",

data/ext/tomoto/extconf.rb CHANGED

@@ -23,9 +23,10 @@ ext = File.expand_path(".", __dir__)
 tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
 eigen = File.expand_path("../../vendor/eigen", __dir__)
 eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
+variant = File.expand_path("../../vendor/variant/include", __dir__)
 $srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
-$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand}"
+$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
 $VPATH << tomoto
 create_makefile("tomoto/ext")

data/lib/tomoto/dmr.rb CHANGED

@@ -9,7 +9,7 @@ module Tomoto
     end
     def add_doc(doc, metadata: "")
-      _add_doc(prepare_doc(doc), [metadata])
+      _add_doc(prepare_doc(doc), metadata)
     end
     def lambdas

data/lib/tomoto/gdmr.rb CHANGED

@@ -9,7 +9,7 @@ module Tomoto
     end
     def add_doc(doc, metadata: [])
-      _add_doc(prepare_doc(doc), metadata.map(&:to_s))
+      _add_doc(prepare_doc(doc), metadata)
     end
   end
 end

data/lib/tomoto/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Tomoto
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
 end

data/vendor/tomotopy/LICENSE CHANGED

@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2019
+Copyright (c) 2019, bab2min
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

data/vendor/tomotopy/README.kr.rst CHANGED

@@ -35,7 +35,7 @@ tomotopy 란?
 더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
-tomotopy의 가장 최신버전은 0.9.1 입니다.
+tomotopy의 가장 최신버전은 0.10.0 입니다.
 시작하기
 ---------------
@@ -255,6 +255,17 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 역사
 -------
+* 0.10.0 (2020-12-19)
+    * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
+    * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
+    * `tomotopy.utils.Corpus.extract_ngrams`와 `tomotopy.utils.Corpus.concat_ngrams`이 추가되었습니다. PMI를 이용해 코퍼스 내에서 자동으로 n-gram collocation을 발견해 한 단어로 합치는 기능을 수행합니다.
+    * `tomotopy.LDAModel.add_corpus`가 추가되었고, `tomotopy.LDAModel.infer`가 Raw 코퍼스를 입력으로 받을 수 있게 되었습니다.
+    * `tomotopy.coherence` 모듈이 추가되었습니다. 생성된 토픽 모델의 coherence를 계산하는 기능을 담당합니다.
+    * `tomotopy.label.FoRelevance`에 window_size 파라미터가 추가되었습니다.
+    * `tomotopy.HDPModel` 학습 시 종종 NaN이 발생하는 문제를 해결했습니다.
+    * 이제 Python3.9를 지원합니다.
+    * py-cpuinfo에 대한 의존성이 제거되고, 모듈 로딩속도가 개선되었습니다.
 * 0.9.1 (2020-08-08)
     * 0.9.0 버전의 메모리 누수 문제가 해결되었습니다.
     * `tomotopy.CTModel.summary()`가 잘못된 결과를 출력하는 문제가 해결되었습니다.
@@ -277,7 +288,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 * 0.8.2 (2020-07-14)
     * `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다.
-    * `seed`가 동일해서 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
+    * `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
 * 0.8.1 (2020-06-08)
     * `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다.
@@ -302,7 +313,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 * 0.7.0 (2020-04-18)
     * `tomotopy.DTModel`이 추가되었습니다.
     * `tomotopy.utils.Corpus.save`가 제대로 작동하지 않는 버그가 수정되었습니다.
-    * `tomotopy.LDAModel.get_count_vector`가 추가되었습니다.
+    * `tomotopy.Document.get_count_vector`가 추가되었습니다.
     * 리눅스용 바이너리가 manylinux2010 버전으로 변경되었고 이에 따른 최적화가 진행되었습니다.
 * 0.6.2 (2020-03-28)
@@ -373,3 +384,21 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 * 0.1.0 (2019-05-12)
     * **tomotopy**의 최초 버전
+다른 언어용 바인딩
+-------------------
+* Ruby: https://github.com/ankane/tomoto
+포함된 라이브러리들의 라이센스
+-------------------------------
+* Eigen:
+  This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
+  A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
+  The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
+* EigenRand: `MIT License
+  <licenses_bundled/EigenRand>`_
+* Mapbox Variant: `BSD License
+  <licenses_bundled/MapboxVariant>`_

data/vendor/tomotopy/README.rst CHANGED

@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
 Please visit https://bab2min.github.io/tomotopy to see more information.
-The most recent version of tomotopy is 0.9.1.
+The most recent version of tomotopy is 0.10.0.
 Getting Started
 ---------------
@@ -261,6 +261,17 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 History
 -------
+* 0.10.0 (2020-12-19)
+    * The interfaces of `tomotopy.utils.Corpus` and `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
+    * __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also indexing by Iterable[int] and slicing are supported. Indexing by uid is also supported.
+    * New methods `tomotopy.utils.Corpus.extract_ngrams` and `tomotopy.utils.Corpus.concat_ngrams` were added. They extract n-gram collocations using PMI and concatenate them into single words.
+    * A new method `tomotopy.LDAModel.add_corpus` was added, and `tomotopy.LDAModel.infer` can receive corpus as input.
+    * A new module `tomotopy.coherence` was added. It provides the way to calculate coherence of the model.
+    * A parameter `window_size` was added to `tomotopy.label.FoRelevance`.
+    * An issue was fixed where NaN often occurs when training `tomotopy.HDPModel`.
+    * Now Python3.9 is supported.
+    * The dependency on py-cpuinfo was removed and the initialization of the module was improved.
 * 0.9.1 (2020-08-08)
     * Memory leaks of version 0.9.0 were fixed.
     * `tomotopy.CTModel.summary()` was fixed.
@@ -380,3 +391,21 @@ History
 * 0.1.0 (2019-05-12)
     * First version of **tomotopy**
+Bindings for Other Languages
+------------------------------
+* Ruby: https://github.com/ankane/tomoto
+Bundled Libraries and Their License
+------------------------------------
+* Eigen:
+  This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
+  A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
+  The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
+* EigenRand: `MIT License
+  <licenses_bundled/EigenRand>`_
+* Mapbox Variant: `BSD License
+  <licenses_bundled/MapboxVariant>`_

data/vendor/tomotopy/src/Labeling/FoRelevance.cpp CHANGED

@@ -5,161 +5,74 @@
 using namespace tomoto::label;
-namespace std
+class DocWrapper
 {
-	template <>
-	struct hash<pair<tomoto::Vid, tomoto::Vid>>
+	const tomoto::DocumentBase* doc;
+public:
+	DocWrapper(const tomoto::DocumentBase* _doc = nullptr)
+		: doc{ _doc }
 	{
-		size_t operator()(const pair<tomoto::Vid, tomoto::Vid>& k) const
-		{
-			return hash<tomoto::Vid>{}(k.first) ^ hash<tomoto::Vid>{}(k.second);
-		}
-	};
-}
-std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel * tm) const
-{
-	auto& vocabFreqs = tm->getVocabCf();
-	auto& vocabDf = tm->getVocabDf();
-	// counting unigrams & bigrams
-	std::unordered_map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
+	}
-	for (size_t i = 0; i < tm->getNumDocs(); ++i)
+	size_t size() const
 	{
-		std::unordered_set<std::pair<Vid, Vid>> uniqBigram;
-		auto doc = tm->getDoc(i);
-		Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
-		for (size_t j = 1; j < doc->words.size(); ++j)
-		{
-			Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
-			if (vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
-			{
-				if (vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
-				{
-					bigramCnt[std::make_pair(prevWord, curWord)]++;
-					uniqBigram.emplace(prevWord, curWord);
-				}
-			}
-			prevWord = curWord;
-		}
-		for (auto& p : uniqBigram) bigramDf[p]++;
+		return doc->words.size();
 	}
-	// counting ngrams
-	std::vector<TrieEx<Vid, size_t>> trieNodes;
-	if (maxLabelLen > 2)
+	tomoto::Vid operator[](size_t idx) const
 	{
-		std::unordered_set<std::pair<Vid, Vid>> validPair;
-		for (auto& p : bigramCnt)
-		{
-			if (p.second >= candMinCnt) validPair.emplace(p.first);
-		}
-		trieNodes.resize(1);
-		auto allocNode = [&]() { return trieNodes.emplace_back(), &trieNodes.back(); };
-		for (size_t i = 0; i < tm->getNumDocs(); ++i)
-		{
-			auto doc = tm->getDoc(i);
-			if (trieNodes.capacity() < trieNodes.size() + doc->words.size() * maxLabelLen)
-			{
-				trieNodes.reserve(std::max(trieNodes.size() + doc->words.size() * maxLabelLen, trieNodes.capacity() * 2));
-			}
-			Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
-			size_t labelLen = 0;
-			auto node = &trieNodes[0];
-			if (vocabFreqs[prevWord] >= candMinCnt)
-			{
-				node = trieNodes[0].makeNext(prevWord, allocNode);
-				node->val++;
-				labelLen = 1;
-			}
-			for (size_t j = 1; j < doc->words.size(); ++j)
-			{
-				Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
+		return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
+	}
+};
-				if (vocabFreqs[curWord] < candMinCnt)
-				{
-					node = &trieNodes[0];
-					labelLen = 0;
-				}
-				else
-				{
-					if (labelLen >= maxLabelLen)
-					{
-						node = node->getFail();
-						labelLen--;
-					}
+class DocIterator
+{
+	const tomoto::ITopicModel* tm;
+	size_t idx;
+public:
+	DocIterator(const tomoto::ITopicModel* _tm = nullptr, size_t _idx = 0)
+		: tm{ _tm }, idx{ _idx }
+	{
+	}
-					if (validPair.count(std::make_pair(prevWord, curWord)))
-					{
-						auto nnode = node->makeNext(curWord, allocNode);
-						node = nnode;
-						do
-						{
-							nnode->val++;
-						} while (nnode = nnode->getFail());
-						labelLen++;
-					}
-					else
-					{
-						node = trieNodes[0].makeNext(curWord, allocNode);
-						node->val++;
-						labelLen = 1;
-					}
-				}
-				prevWord = curWord;
-			}
-		}
+	DocWrapper operator*() const
+	{
+		return { tm->getDoc(idx) };
 	}
-	// calculating PMIs
-	std::vector<Candidate> candidates;
-	for (auto& p : bigramCnt)
+	DocIterator& operator++()
 	{
-		auto& bigram = p.first;
-		if (p.second < candMinCnt) continue;
-		if (bigramDf[bigram] < candMinDf) continue;
-		auto pmi = std::log(p.second * (float)tm->getN()
-			/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
-		if (pmi <= 0) continue;
-		candidates.emplace_back(pmi, bigram.first, bigram.second);
+		++idx;
+		return *this;
 	}
-	if (maxLabelLen > 2)
+	bool operator==(const DocIterator& o) const
 	{
-		std::vector<Vid> rkeys;
-		trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
-		{
-			if (rkeys.size() <= 2 || node->val < candMinCnt) return;
-			float n = (float)tm->getN();
-			auto pmi = node->val / n;
-			for (auto k : rkeys)
-			{
-				pmi *= n / vocabFreqs[k];
-			}
-			pmi = std::log(pmi);
-			candidates.emplace_back(pmi, rkeys);
-		}, rkeys);
+		return tm == o.tm && idx == o.idx;
 	}
-	std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
+	bool operator!=(const DocIterator& o) const
 	{
-		return a.score > b.score;
-	});
-	if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
+		return tm != o.tm || idx != o.idx;
+	}
+};
-	for (size_t i = 0; i < vocabDf.size(); ++i)
+std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) const
+{
+	auto& vocabFreqs = tm->getVocabCf();
+	auto& vocabDf = tm->getVocabDf();
+	auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
+		vocabFreqs, vocabDf,
+		candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
+	);
+	if (minLabelLen <= 1)
 	{
-		if (vocabFreqs[i] < candMinCnt) continue;
-		if (vocabDf[i] < candMinDf) continue;
-		candidates.emplace_back(0.f, i);
+		for (size_t i = 0; i < vocabDf.size(); ++i)
+		{
+			if (vocabFreqs[i] < candMinCnt) continue;
+			if (vocabDf[i] < candMinDf) continue;
+			candidates.emplace_back(0.f, i);
+		}
 	}
 	return candidates;
 }
@@ -172,8 +85,7 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
 	auto node = root;
 	for (size_t j = 0; j < doc->words.size(); ++j)
 	{
-		size_t t = doc->wOrder.empty() ? j : doc->wOrder[j];
-		tomoto::Vid curWord = doc->words[t];
+		tomoto::Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
 		if (curWord < tm->getV()) bdf[curWord] = 1;
 		auto nnode = node->getNext(curWord);
 		while (!nnode)
@@ -191,16 +103,15 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
 				// the matched candidate is found
 				if (nnode->val && nnode->val != (size_t)-1)
 				{
-					auto& c = candidates[nnode->val - 1];
 					tomoto::OptionalLock<_lock> lock{ mtx[(nnode->val - 1) % (pool ? pool->getNumWorkers() : 1)] };
+					auto& c = candidates[nnode->val - 1];
 					if (c.name.empty() && !doc->origWordPos.empty())
 					{
 						size_t start = doc->origWordPos[j + 1 - c.w.size()];
 						size_t end = doc->origWordPos[j] + doc->origWordLen[j];
 						c.names[doc->rawStr.substr(start, end - start)]++;
 					}
-					auto& docIds = c.docIds;
-					if (docIds.empty() || docIds.back() != docId) docIds.emplace_back(docId);
+					c.docIds.emplace(docId);
 				}
 			} while (nnode = nnode->getFail());
 		}
@@ -268,7 +179,22 @@ void FoRelevance::estimateContexts()
 		wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
 	}
-	auto calcScores = [&](CandidateEx& c)
+	size_t totDocCnt = 0;
+	if (windowSize == (size_t)-1)
+	{
+		totDocCnt = tm->getNumDocs();
+	}
+	else
+	{
+		for (size_t i = 0; i < tm->getNumDocs(); ++i)
+		{
+			size_t s = tm->getDoc(i)->words.size();
+			if (s <= windowSize) totDocCnt += 1;
+			else totDocCnt += s - windowSize + 1;
+		}
+	}
+	auto calcScores = [&](CandidateEx& c, size_t windowSize)
 	{
 		if (c.docIds.size() < candMinDf) return;
 		if (c.name.empty() && !c.names.empty())
@@ -284,20 +210,80 @@ void FoRelevance::estimateContexts()
 			}
 		}
+		size_t docCnt = 0;
 		Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
 		for (auto& docId : c.docIds)
 		{
 			thread_local Eigen::VectorXi bdf(this->tm->getV());
 			bdf.setZero();
 			auto doc = this->tm->getDoc(docId);
-			for (size_t i = 0; i < doc->words.size(); ++i)
+			if (doc->words.size() <= windowSize)
 			{
-				if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
+				for (size_t i = 0; i < doc->words.size(); ++i)
+				{
+					if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
+				}
+				docCnt++;
+				wcPMI += bdf.template cast<Float>();
+			}
+			else
+			{
+				auto wit = c.w.begin();
+				std::deque<size_t> wpos;
+				for (size_t i = 0; i < windowSize; ++i)
+				{
+					Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
+					if (word < this->tm->getV()) bdf[word]++;
+					if (word == *wit)
+					{
+						if (++wit == c.w.end())
+						{
+							wpos.emplace_back(i + 1);
+							wit = c.w.begin();
+						}
+					}
+					else if (word == c.w[0]) wit = c.w.begin() + 1;
+					else wit = c.w.begin();
+				}
+				if (!wpos.empty())
+				{
+					docCnt++;
+					wcPMI += Eigen::bool2float(bdf.array()).matrix();
+				}
+				for (size_t i = windowSize; i < doc->words.size(); ++i)
+				{
+					Vid oword = doc->words[doc->wOrder.empty() ? (i - windowSize) : doc->wOrder[i - windowSize]];
+					Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
+					if (oword < this->tm->getV()) bdf[oword]--;
+					if (word < this->tm->getV()) bdf[word]++;
+					if (!wpos.empty() && wpos.front() - c.w.size() <= i - windowSize)
+					{
+						wpos.pop_front();
+					}
+					if (word == *wit)
+					{
+						if (++wit == c.w.end())
+						{
+							wpos.emplace_back(i + 1);
+							wit = c.w.begin();
+						}
+					}
+					else if (word == c.w[0]) wit = c.w.begin() + 1;
+					else wit = c.w.begin();
+					if (!wpos.empty())
+					{
+						docCnt++;
+						wcPMI += Eigen::bool2float(bdf.array()).matrix();
+					}
+				}
 			}
-			wcPMI += bdf.cast<Float>();
 		}
 		c.scores = wordTopicDist.transpose() *
-			((wcPMI.array() + smoothing) * this->tm->getNumDocs() / c.docIds.size() / df.cast<Float>()).log().matrix();
+			((wcPMI.array() + smoothing) * totDocCnt / docCnt / df.cast<Float>()).log().matrix();
 	};
 	if (pool)
@@ -311,7 +297,7 @@ void FoRelevance::estimateContexts()
 			{
 				for (size_t i = g; i < candidates.size(); i += groups)
 				{
-					calcScores(candidates[i]);
+					calcScores(candidates[i], windowSize);
 				}
 			}, g));
 		}
@@ -321,7 +307,7 @@ void FoRelevance::estimateContexts()
 	{
 		for (auto& c : candidates)
 		{
-			calcScores(c);
+			calcScores(c, windowSize);
 		}
 	}