tomoto 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -0
  5. data/ext/tomoto/ct.cpp +54 -0
  6. data/ext/tomoto/dmr.cpp +62 -0
  7. data/ext/tomoto/dt.cpp +82 -0
  8. data/ext/tomoto/ext.cpp +27 -773
  9. data/ext/tomoto/gdmr.cpp +34 -0
  10. data/ext/tomoto/hdp.cpp +42 -0
  11. data/ext/tomoto/hlda.cpp +66 -0
  12. data/ext/tomoto/hpa.cpp +27 -0
  13. data/ext/tomoto/lda.cpp +250 -0
  14. data/ext/tomoto/llda.cpp +29 -0
  15. data/ext/tomoto/mglda.cpp +71 -0
  16. data/ext/tomoto/pa.cpp +27 -0
  17. data/ext/tomoto/plda.cpp +29 -0
  18. data/ext/tomoto/slda.cpp +40 -0
  19. data/ext/tomoto/utils.h +84 -0
  20. data/lib/tomoto/tomoto.bundle +0 -0
  21. data/lib/tomoto/tomoto.so +0 -0
  22. data/lib/tomoto/version.rb +1 -1
  23. data/vendor/tomotopy/README.kr.rst +12 -3
  24. data/vendor/tomotopy/README.rst +12 -3
  25. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
  26. data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
  27. data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
  28. data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
  29. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
  30. data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
  31. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
  32. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
  33. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
  34. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
  35. data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
  36. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
  37. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
  39. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
  40. data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
  41. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
  42. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
  43. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
  44. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
  45. data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
  46. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
  47. data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
  48. data/vendor/tomotopy/src/Utils/math.h +8 -4
  49. data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
  50. metadata +24 -60
@@ -0,0 +1,71 @@
1
+ #include <MGLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_mglda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(m, "MGLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
12
+ return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
13
+ })
14
+ .define_method(
15
+ "_add_doc",
16
+ *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
17
+ auto doc = buildDoc(words);
18
+ doc.misc["delimiter"] = delimiter;
19
+ return self.addDoc(doc);
20
+ })
21
+ .define_method(
22
+ "alpha_g",
23
+ *[](tomoto::IMGLDAModel& self) {
24
+ return self.getAlpha();
25
+ })
26
+ .define_method(
27
+ "alpha_l",
28
+ *[](tomoto::IMGLDAModel& self) {
29
+ return self.getAlphaL();
30
+ })
31
+ .define_method(
32
+ "alpha_mg",
33
+ *[](tomoto::IMGLDAModel& self) {
34
+ return self.getAlphaM();
35
+ })
36
+ .define_method(
37
+ "alpha_ml",
38
+ *[](tomoto::IMGLDAModel& self) {
39
+ return self.getAlphaML();
40
+ })
41
+ .define_method(
42
+ "eta_g",
43
+ *[](tomoto::IMGLDAModel& self) {
44
+ return self.getEta();
45
+ })
46
+ .define_method(
47
+ "eta_l",
48
+ *[](tomoto::IMGLDAModel& self) {
49
+ return self.getEtaL();
50
+ })
51
+ .define_method(
52
+ "gamma",
53
+ *[](tomoto::IMGLDAModel& self) {
54
+ return self.getGamma();
55
+ })
56
+ .define_method(
57
+ "k_g",
58
+ *[](tomoto::IMGLDAModel& self) {
59
+ return self.getK();
60
+ })
61
+ .define_method(
62
+ "k_l",
63
+ *[](tomoto::IMGLDAModel& self) {
64
+ return self.getKL();
65
+ })
66
+ .define_method(
67
+ "t",
68
+ *[](tomoto::IMGLDAModel& self) {
69
+ return self.getT();
70
+ });
71
+ }
data/ext/tomoto/pa.cpp ADDED
@@ -0,0 +1,27 @@
1
+ #include <PA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_pa(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(m, "PA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "k1",
19
+ *[](tomoto::IPAModel& self) {
20
+ return self.getK();
21
+ })
22
+ .define_method(
23
+ "k2",
24
+ *[](tomoto::IPAModel& self) {
25
+ return self.getK2();
26
+ });
27
+ }
@@ -0,0 +1,29 @@
1
+ #include <PLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_plda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "_add_doc",
19
+ *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
20
+ auto doc = buildDoc(words);
21
+ doc.misc["labels"] = labels;
22
+ return self.addDoc(doc);
23
+ })
24
+ .define_method(
25
+ "latent_topics",
26
+ *[](tomoto::IPLDAModel& self) {
27
+ return self.getNumLatentTopics();
28
+ });
29
+ }
@@ -0,0 +1,40 @@
1
+ #include <SLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_slda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ std::vector<tomoto::ISLDAModel::GLM> vars;
16
+ vars.reserve(rb_vars.size());
17
+ for (auto const& v : rb_vars) {
18
+ vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
19
+ }
20
+ return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
21
+ })
22
+ .define_method(
23
+ "_add_doc",
24
+ *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
25
+ auto doc = buildDoc(words);
26
+ doc.misc["y"] = y;
27
+ return self.addDoc(doc);
28
+ })
29
+ .define_method(
30
+ "f",
31
+ *[](tomoto::ISLDAModel& self) {
32
+ return self.getF();
33
+ })
34
+ .define_method(
35
+ "_var_type",
36
+ *[](tomoto::ISLDAModel& self, size_t var_id) {
37
+ if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
38
+ return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
39
+ });
40
+ }
@@ -0,0 +1,84 @@
1
+ #pragma once
2
+
3
+ #include <rice/Array.hpp>
4
+
5
+ using Rice::Array;
6
+ using Rice::Object;
7
+
8
+ template<>
9
+ inline
10
+ Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
11
+ {
12
+ Array res;
13
+ for (auto const& v : x) {
14
+ res.push(v);
15
+ }
16
+ return res;
17
+ }
18
+
19
+ template<>
20
+ inline
21
+ Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
22
+ {
23
+ Array res;
24
+ for (auto const& v : x) {
25
+ res.push(v);
26
+ }
27
+ return res;
28
+ }
29
+
30
+ template<>
31
+ inline
32
+ Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
33
+ {
34
+ Array res;
35
+ for (auto const& v : x) {
36
+ res.push(v);
37
+ }
38
+ return res;
39
+ }
40
+
41
+ template<>
42
+ inline
43
+ std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
44
+ {
45
+ Array a = Array(x);
46
+ std::vector<std::string> res;
47
+ res.reserve(a.size());
48
+ for (auto const& v : a) {
49
+ res.push_back(from_ruby<std::string>(v));
50
+ }
51
+ return res;
52
+ }
53
+
54
+ template<>
55
+ inline
56
+ std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
57
+ {
58
+ Array a = Array(x);
59
+ std::vector<tomoto::Float> res;
60
+ res.reserve(a.size());
61
+ for (auto const& v : a) {
62
+ res.push_back(from_ruby<tomoto::Float>(v));
63
+ }
64
+ return res;
65
+ }
66
+
67
+ template<>
68
+ inline
69
+ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
70
+ {
71
+ Array a = Array(x);
72
+ std::vector<uint64_t> res;
73
+ res.reserve(a.size());
74
+ for (auto const& v : a) {
75
+ res.push_back(from_ruby<uint64_t>(v));
76
+ }
77
+ return res;
78
+ }
79
+
80
+ inline tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
81
+ tomoto::RawDoc doc;
82
+ doc.rawWords = words;
83
+ return doc;
84
+ }
Binary file
Binary file
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -35,7 +35,7 @@ tomotopy 란?
35
35
 
36
36
  더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
37
37
 
38
- tomotopy의 가장 최신버전은 0.10.0 입니다.
38
+ tomotopy의 가장 최신버전은 0.10.2 입니다.
39
39
 
40
40
  시작하기
41
41
  ---------------
@@ -245,7 +245,7 @@ LDA모델로 1000회 iteration을 수행시 걸리는 시간을 초 단위로
245
245
 
246
246
  예제 코드
247
247
  ---------
248
- tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/examples/ 를 확인하시길 바랍니다.
248
+ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/main/examples/ 를 확인하시길 바랍니다.
249
249
 
250
250
  예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다.
251
251
 
@@ -255,6 +255,16 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
255
255
 
256
256
  역사
257
257
  -------
258
+ * 0.10.2 (2021-02-16)
259
+ * `tomotopy.CTModel.train`가 큰 K값에 대해 실패하는 문제가 수정되었습니다.
260
+ * `tomotopy.utils.Corpus`가 `uid`값을 잃는 문제가 수정되었습니다.
261
+
262
+ * 0.10.1 (2021-02-14)
263
+ * `tomotopy.utils.Corpus.extract_ngrams`에 빈 문헌을 입력시 발생하던 에러를 수정했습니다.
264
+ * `tomotopy.LDAModel.infer`가 올바른 입력에도 예외를 발생시키던 문제를 수정했습니다.
265
+ * `tomotopy.HLDAModel.infer`가 잘못된 `tomotopy.Document.path` 값을 생성하는 문제를 수정했습니다.
266
+ * `tomotopy.HLDAModel.train`에 새로운 파라미터 `freeze_topics`가 추가되었습니다. 이를 통해 학습 시 신규 토픽 생성 여부를 조정할 수 있습니다.
267
+
258
268
  * 0.10.0 (2020-12-19)
259
269
  * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
260
270
  * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
@@ -387,7 +397,6 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
387
397
 
388
398
  다른 언어용 바인딩
389
399
  -------------------
390
-
391
400
  * Ruby: https://github.com/ankane/tomoto
392
401
 
393
402
  포함된 라이브러리들의 라이센스
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
36
36
 
37
37
  Please visit https://bab2min.github.io/tomotopy to see more information.
38
38
 
39
- The most recent version of tomotopy is 0.10.0.
39
+ The most recent version of tomotopy is 0.10.2.
40
40
 
41
41
  Getting Started
42
42
  ---------------
@@ -250,7 +250,7 @@ See `word_prior_example` in `example.py` for more details.
250
250
 
251
251
  Examples
252
252
  --------
253
- You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/master/examples/ .
253
+ You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/main/examples/ .
254
254
 
255
255
  You can also get the data file used in the example code at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view .
256
256
 
@@ -261,6 +261,16 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
261
261
 
262
262
  History
263
263
  -------
264
+ * 0.10.2 (2021-02-16)
265
+ * An issue was fixed where `tomotopy.CTModel.train` fails with large K.
266
+ * An issue was fixed where `tomotopy.utils.Corpus` loses its `uid` values.
267
+
268
+ * 0.10.1 (2021-02-14)
269
+ * An issue was fixed where `tomotopy.utils.Corpus.extract_ngrams` crashes with empty input.
270
+ * An issue was fixed where `tomotopy.LDAModel.infer` raises an exception with valid input.
271
+ * An issue was fixed where `tomotopy.HLDAModel.infer` generates wrong `tomotopy.Document.path`.
272
+ * A new parameter `freeze_topics` was added to `tomotopy.HLDAModel.train`, letting you control whether new topics may be created during training.
273
+
264
274
  * 0.10.0 (2020-12-19)
265
275
  * The interface of `tomotopy.utils.Corpus` and of `tomotopy.LDAModel.docs` were unified. Now you can access the document in corpus with the same manner.
266
276
  * __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also by Iterable[int], slicing are supported. Also indexing by uid is supported.
@@ -394,7 +404,6 @@ History
394
404
 
395
405
  Bindings for Other Languages
396
406
  ------------------------------
397
-
398
407
  * Ruby: https://github.com/ankane/tomoto
399
408
 
400
409
  Bundled Libraries and Their License
@@ -2,6 +2,7 @@
2
2
  #include <numeric>
3
3
 
4
4
  #include "FoRelevance.h"
5
+ #include "Phraser.hpp"
5
6
 
6
7
  using namespace tomoto::label;
7
8
 
@@ -23,6 +24,26 @@ public:
23
24
  {
24
25
  return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
25
26
  }
27
+
28
+ auto begin() const -> decltype(doc->words.begin())
29
+ {
30
+ return doc->words.begin();
31
+ }
32
+
33
+ auto end() const -> decltype(doc->words.end())
34
+ {
35
+ return doc->words.end();
36
+ }
37
+
38
+ auto rbegin() const -> decltype(doc->words.rbegin())
39
+ {
40
+ return doc->words.rbegin();
41
+ }
42
+
43
+ auto rend() const -> decltype(doc->words.rend())
44
+ {
45
+ return doc->words.rend();
46
+ }
26
47
  };
27
48
 
28
49
  class DocIterator
@@ -61,9 +82,10 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
61
82
  {
62
83
  auto& vocabFreqs = tm->getVocabCf();
63
84
  auto& vocabDf = tm->getVocabDf();
64
- auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
85
+ auto candidates = phraser::extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
65
86
  vocabFreqs, vocabDf,
66
- candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
87
+ candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, 0.f,
88
+ normalized
67
89
  );
68
90
  if (minLabelLen <= 1)
69
91
  {
@@ -77,6 +99,29 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
77
99
  return candidates;
78
100
  }
79
101
 
102
+
103
+ std::vector<Candidate> tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const
104
+ {
105
+ auto& vocabFreqs = tm->getVocabCf();
106
+ auto& vocabDf = tm->getVocabDf();
107
+ auto candidates = phraser::extractPMIBENgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
108
+ vocabFreqs, vocabDf,
109
+ candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates,
110
+ 0.f, 0.f
111
+ );
112
+ if (minLabelLen <= 1)
113
+ {
114
+ for (size_t i = 0; i < vocabDf.size(); ++i)
115
+ {
116
+ if (vocabFreqs[i] < candMinCnt) continue;
117
+ if (vocabDf[i] < candMinDf) continue;
118
+ candidates.emplace_back(0.f, i);
119
+ }
120
+ }
121
+ return candidates;
122
+ }
123
+
124
+
80
125
  template<bool _lock>
81
126
  const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::DocumentBase* doc, const tomoto::Trie<tomoto::Vid, size_t>* root)
82
127
  {
@@ -4,6 +4,7 @@
4
4
  #include "Labeler.h"
5
5
  #include "../Utils/EigenAddonOps.hpp"
6
6
  #include "../Utils/Trie.hpp"
7
+ #include "../Utils/ThreadPool.hpp"
7
8
 
8
9
  /*
9
10
  Implementation of First-order Relevance for topic labeling by bab2min
@@ -16,166 +17,35 @@ namespace tomoto
16
17
  {
17
18
  namespace label
18
19
  {
19
- template<typename _DocIter, typename _Freqs>
20
- std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
21
- _Freqs&& vocabFreqs, _Freqs&& vocabDf,
22
- size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
20
+ class PMIExtractor : public IExtractor
23
21
  {
24
- struct vvhash
25
- {
26
- size_t operator()(const std::pair<Vid, Vid>& k) const
27
- {
28
- return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
29
- }
30
- };
31
-
32
- // counting unigrams & bigrams
33
- std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
34
-
35
- for(auto docIt = docBegin; docIt != docEnd; ++docIt)
36
- {
37
- std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
38
- auto doc = *docIt;
39
- Vid prevWord = doc[0];
40
- for (size_t j = 1; j < doc.size(); ++j)
41
- {
42
- Vid curWord = doc[j];
43
- if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
44
- {
45
- if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
46
- {
47
- bigramCnt[std::make_pair(prevWord, curWord)]++;
48
- uniqBigram.emplace(prevWord, curWord);
49
- }
50
- }
51
- prevWord = curWord;
52
- }
53
-
54
- for (auto& p : uniqBigram) bigramDf[p]++;
55
- }
56
-
57
-
58
- // counting ngrams
59
- std::vector<TrieEx<Vid, size_t>> trieNodes;
60
-
61
- if (maxNgrams > 2)
62
- {
63
- std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
64
- for (auto& p : bigramCnt)
65
- {
66
- if (p.second >= candMinCnt) validPair.emplace(p.first);
67
- }
68
-
69
- trieNodes.resize(1);
70
- auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
71
-
72
- for (auto docIt = docBegin; docIt != docEnd; ++docIt)
73
- {
74
- auto doc = *docIt;
75
- if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
76
- {
77
- trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
78
- }
79
-
80
- Vid prevWord = doc[0];
81
- size_t labelLen = 0;
82
- auto node = &trieNodes[0];
83
- if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
84
- {
85
- node = trieNodes[0].makeNext(prevWord, allocNode);
86
- node->val++;
87
- labelLen = 1;
88
- }
89
-
90
- for (size_t j = 1; j < doc.size(); ++j)
91
- {
92
- Vid curWord = doc[j];
93
-
94
- if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
95
- {
96
- node = &trieNodes[0];
97
- labelLen = 0;
98
- }
99
- else
100
- {
101
- if (labelLen >= maxNgrams)
102
- {
103
- node = node->getFail();
104
- labelLen--;
105
- }
106
-
107
- if (validPair.count(std::make_pair(prevWord, curWord)))
108
- {
109
- auto nnode = node->makeNext(curWord, allocNode);
110
- node = nnode;
111
- do
112
- {
113
- nnode->val++;
114
- } while (nnode = nnode->getFail());
115
- labelLen++;
116
- }
117
- else
118
- {
119
- node = trieNodes[0].makeNext(curWord, allocNode);
120
- node->val++;
121
- labelLen = 1;
122
- }
123
- }
124
- prevWord = curWord;
125
- }
126
- }
127
- }
128
-
129
- float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
130
-
131
- // calculating PMIs
132
- std::vector<Candidate> candidates;
133
- for (auto& p : bigramCnt)
134
- {
135
- auto& bigram = p.first;
136
- if (p.second < candMinCnt) continue;
137
- if (bigramDf[bigram] < candMinDf) continue;
138
- auto pmi = std::log(p.second * totN
139
- / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
140
- if (pmi <= 0) continue;
141
- candidates.emplace_back(pmi, bigram.first, bigram.second);
142
- }
143
-
144
- if (maxNgrams > 2)
22
+ size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
23
+ bool normalized;
24
+ public:
25
+ PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
26
+ size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000,
27
+ bool _normalized = false
28
+ )
29
+ : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf },
30
+ minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen },
31
+ maxCandidates{ _maxCandidates }, normalized{ _normalized }
145
32
  {
146
- std::vector<Vid> rkeys;
147
- trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
148
- {
149
- if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
150
- auto pmi = node->val / totN;
151
- for (auto k : rkeys)
152
- {
153
- pmi *= totN / vocabFreqs[k];
154
- }
155
- pmi = std::log(pmi);
156
- if (pmi < minScore) return;
157
- candidates.emplace_back(pmi, rkeys);
158
- }, rkeys);
159
33
  }
160
34
 
161
- std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
162
- {
163
- return a.score > b.score;
164
- });
165
- if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
166
- return candidates;
167
- }
168
-
35
+ std::vector<Candidate> extract(const ITopicModel* tm) const override;
36
+ };
169
37
 
170
- class PMIExtractor : public IExtractor
38
+ class PMIBEExtractor : public IExtractor
171
39
  {
172
40
  size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
173
41
  public:
174
- PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
175
- : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
42
+ PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
43
+ size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000
44
+ )
45
+ : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
176
46
  {
177
47
  }
178
-
48
+
179
49
  std::vector<Candidate> extract(const ITopicModel* tm) const override;
180
50
  };
181
51
 
@@ -212,7 +82,7 @@ namespace tomoto
212
82
 
213
83
  public:
214
84
  template<typename _Iter>
215
- FoRelevance(const ITopicModel* _tm,
85
+ FoRelevance(const ITopicModel* _tm,
216
86
  _Iter candFirst, _Iter candEnd,
217
87
  size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
218
88
  size_t _windowSize = (size_t)-1,