tomoto 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -0
  5. data/ext/tomoto/ct.cpp +54 -0
  6. data/ext/tomoto/dmr.cpp +62 -0
  7. data/ext/tomoto/dt.cpp +82 -0
  8. data/ext/tomoto/ext.cpp +27 -773
  9. data/ext/tomoto/gdmr.cpp +34 -0
  10. data/ext/tomoto/hdp.cpp +42 -0
  11. data/ext/tomoto/hlda.cpp +66 -0
  12. data/ext/tomoto/hpa.cpp +27 -0
  13. data/ext/tomoto/lda.cpp +250 -0
  14. data/ext/tomoto/llda.cpp +29 -0
  15. data/ext/tomoto/mglda.cpp +71 -0
  16. data/ext/tomoto/pa.cpp +27 -0
  17. data/ext/tomoto/plda.cpp +29 -0
  18. data/ext/tomoto/slda.cpp +40 -0
  19. data/ext/tomoto/utils.h +84 -0
  20. data/lib/tomoto/tomoto.bundle +0 -0
  21. data/lib/tomoto/tomoto.so +0 -0
  22. data/lib/tomoto/version.rb +1 -1
  23. data/vendor/tomotopy/README.kr.rst +12 -3
  24. data/vendor/tomotopy/README.rst +12 -3
  25. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
  26. data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
  27. data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
  28. data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
  29. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
  30. data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
  31. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
  32. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
  33. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
  34. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
  35. data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
  36. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
  37. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
  39. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
  40. data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
  41. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
  42. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
  43. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
  44. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
  45. data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
  46. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
  47. data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
  48. data/vendor/tomotopy/src/Utils/math.h +8 -4
  49. data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
  50. metadata +24 -60
@@ -0,0 +1,71 @@
1
+ #include <MGLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_mglda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(m, "MGLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
12
+ return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
13
+ })
14
+ .define_method(
15
+ "_add_doc",
16
+ *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
17
+ auto doc = buildDoc(words);
18
+ doc.misc["delimiter"] = delimiter;
19
+ return self.addDoc(doc);
20
+ })
21
+ .define_method(
22
+ "alpha_g",
23
+ *[](tomoto::IMGLDAModel& self) {
24
+ return self.getAlpha();
25
+ })
26
+ .define_method(
27
+ "alpha_l",
28
+ *[](tomoto::IMGLDAModel& self) {
29
+ return self.getAlphaL();
30
+ })
31
+ .define_method(
32
+ "alpha_mg",
33
+ *[](tomoto::IMGLDAModel& self) {
34
+ return self.getAlphaM();
35
+ })
36
+ .define_method(
37
+ "alpha_ml",
38
+ *[](tomoto::IMGLDAModel& self) {
39
+ return self.getAlphaML();
40
+ })
41
+ .define_method(
42
+ "eta_g",
43
+ *[](tomoto::IMGLDAModel& self) {
44
+ return self.getEta();
45
+ })
46
+ .define_method(
47
+ "eta_l",
48
+ *[](tomoto::IMGLDAModel& self) {
49
+ return self.getEtaL();
50
+ })
51
+ .define_method(
52
+ "gamma",
53
+ *[](tomoto::IMGLDAModel& self) {
54
+ return self.getGamma();
55
+ })
56
+ .define_method(
57
+ "k_g",
58
+ *[](tomoto::IMGLDAModel& self) {
59
+ return self.getK();
60
+ })
61
+ .define_method(
62
+ "k_l",
63
+ *[](tomoto::IMGLDAModel& self) {
64
+ return self.getKL();
65
+ })
66
+ .define_method(
67
+ "t",
68
+ *[](tomoto::IMGLDAModel& self) {
69
+ return self.getT();
70
+ });
71
+ }
data/ext/tomoto/pa.cpp ADDED
@@ -0,0 +1,27 @@
1
+ #include <PA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_pa(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(m, "PA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "k1",
19
+ *[](tomoto::IPAModel& self) {
20
+ return self.getK();
21
+ })
22
+ .define_method(
23
+ "k2",
24
+ *[](tomoto::IPAModel& self) {
25
+ return self.getK2();
26
+ });
27
+ }
@@ -0,0 +1,29 @@
1
+ #include <PLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_plda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "_add_doc",
19
+ *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
20
+ auto doc = buildDoc(words);
21
+ doc.misc["labels"] = labels;
22
+ return self.addDoc(doc);
23
+ })
24
+ .define_method(
25
+ "latent_topics",
26
+ *[](tomoto::IPLDAModel& self) {
27
+ return self.getNumLatentTopics();
28
+ });
29
+ }
@@ -0,0 +1,40 @@
1
+ #include <SLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_slda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ std::vector<tomoto::ISLDAModel::GLM> vars;
16
+ vars.reserve(rb_vars.size());
17
+ for (auto const& v : rb_vars) {
18
+ vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
19
+ }
20
+ return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
21
+ })
22
+ .define_method(
23
+ "_add_doc",
24
+ *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
25
+ auto doc = buildDoc(words);
26
+ doc.misc["y"] = y;
27
+ return self.addDoc(doc);
28
+ })
29
+ .define_method(
30
+ "f",
31
+ *[](tomoto::ISLDAModel& self) {
32
+ return self.getF();
33
+ })
34
+ .define_method(
35
+ "_var_type",
36
+ *[](tomoto::ISLDAModel& self, size_t var_id) {
37
+ if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
38
+ return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
39
+ });
40
+ }
@@ -0,0 +1,84 @@
1
+ #pragma once
2
+
3
+ #include <rice/Array.hpp>
4
+
5
+ using Rice::Array;
6
+ using Rice::Object;
7
+
8
+ template<>
9
+ inline
10
+ Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
11
+ {
12
+ Array res;
13
+ for (auto const& v : x) {
14
+ res.push(v);
15
+ }
16
+ return res;
17
+ }
18
+
19
+ template<>
20
+ inline
21
+ Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
22
+ {
23
+ Array res;
24
+ for (auto const& v : x) {
25
+ res.push(v);
26
+ }
27
+ return res;
28
+ }
29
+
30
+ template<>
31
+ inline
32
+ Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
33
+ {
34
+ Array res;
35
+ for (auto const& v : x) {
36
+ res.push(v);
37
+ }
38
+ return res;
39
+ }
40
+
41
+ template<>
42
+ inline
43
+ std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
44
+ {
45
+ Array a = Array(x);
46
+ std::vector<std::string> res;
47
+ res.reserve(a.size());
48
+ for (auto const& v : a) {
49
+ res.push_back(from_ruby<std::string>(v));
50
+ }
51
+ return res;
52
+ }
53
+
54
+ template<>
55
+ inline
56
+ std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
57
+ {
58
+ Array a = Array(x);
59
+ std::vector<tomoto::Float> res;
60
+ res.reserve(a.size());
61
+ for (auto const& v : a) {
62
+ res.push_back(from_ruby<tomoto::Float>(v));
63
+ }
64
+ return res;
65
+ }
66
+
67
+ template<>
68
+ inline
69
+ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
70
+ {
71
+ Array a = Array(x);
72
+ std::vector<uint64_t> res;
73
+ res.reserve(a.size());
74
+ for (auto const& v : a) {
75
+ res.push_back(from_ruby<uint64_t>(v));
76
+ }
77
+ return res;
78
+ }
79
+
80
+ inline tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
81
+ tomoto::RawDoc doc;
82
+ doc.rawWords = words;
83
+ return doc;
84
+ }
Binary file
Binary file
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -35,7 +35,7 @@ tomotopy 란?
35
35
 
36
36
  더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
37
37
 
38
- tomotopy의 가장 최신버전은 0.10.0 입니다.
38
+ tomotopy의 가장 최신버전은 0.10.2 입니다.
39
39
 
40
40
  시작하기
41
41
  ---------------
@@ -245,7 +245,7 @@ LDA모델로 1000회 iteration을 수행시 걸리는 시간을 초 단위로
245
245
 
246
246
  예제 코드
247
247
  ---------
248
- tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/examples/ 를 확인하시길 바랍니다.
248
+ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/main/examples/ 를 확인하시길 바랍니다.
249
249
 
250
250
  예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다.
251
251
 
@@ -255,6 +255,16 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
255
255
 
256
256
  역사
257
257
  -------
258
+ * 0.10.2 (2021-02-16)
259
+ * `tomotopy.CTModel.train`가 큰 K값에 대해 실패하는 문제가 수정되었습니다.
260
+ * `tomotopy.utils.Corpus`가 `uid`값을 잃는 문제가 수정되었습니다.
261
+
262
+ * 0.10.1 (2021-02-14)
263
+ * `tomotopy.utils.Corpus.extract_ngrams`에 빈 문헌을 입력시 발생하던 에러를 수정했습니다.
264
+ * `tomotopy.LDAModel.infer`가 올바른 입력에도 예외를 발생시키던 문제를 수정했습니다.
265
+ * `tomotopy.HLDAModel.infer`가 잘못된 `tomotopy.Document.path` 값을 생성하는 문제를 수정했습니다.
266
+ * `tomotopy.HLDAModel.train`에 새로운 파라미터 `freeze_topics`가 추가되었습니다. 이를 통해 학습 시 신규 토픽 생성 여부를 조정할 수 있습니다.
267
+
258
268
  * 0.10.0 (2020-12-19)
259
269
  * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
260
270
  * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
@@ -387,7 +397,6 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
387
397
 
388
398
  다른 언어용 바인딩
389
399
  -------------------
390
-
391
400
  * Ruby: https://github.com/ankane/tomoto
392
401
 
393
402
  포함된 라이브러리들의 라이센스
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
36
36
 
37
37
  Please visit https://bab2min.github.io/tomotopy to see more information.
38
38
 
39
- The most recent version of tomotopy is 0.10.0.
39
+ The most recent version of tomotopy is 0.10.2.
40
40
 
41
41
  Getting Started
42
42
  ---------------
@@ -250,7 +250,7 @@ See `word_prior_example` in `example.py` for more details.
250
250
 
251
251
  Examples
252
252
  --------
253
- You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/master/examples/ .
253
+ You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/main/examples/ .
254
254
 
255
255
  You can also get the data file used in the example code at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view .
256
256
 
@@ -261,6 +261,16 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
261
261
 
262
262
  History
263
263
  -------
264
+ * 0.10.2 (2021-02-16)
265
+ * An issue was fixed where `tomotopy.CTModel.train` fails with large K.
266
+ * An issue was fixed where `tomotopy.utils.Corpus` loses its `uid` values.
267
+
268
+ * 0.10.1 (2021-02-14)
269
+ * An issue was fixed where `tomotopy.utils.Corpus.extract_ngrams` crashes with empty input.
270
+ * An issue was fixed where `tomotopy.LDAModel.infer` raises exception with valid input.
271
+ * An issue was fixed where `tomotopy.HLDAModel.infer` generates wrong `tomotopy.Document.path`.
272
+ * Since a new parameter `freeze_topics` for `tomotopy.HLDAModel.train` was added, you can control whether to create a new topic or not when training.
273
+
264
274
  * 0.10.0 (2020-12-19)
265
275
  * The interface of `tomotopy.utils.Corpus` and of `tomotopy.LDAModel.docs` were unified. Now you can access the document in corpus with the same manner.
266
276
  * __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also by Iterable[int], slicing are supported. Also indexing by uid is supported.
@@ -394,7 +404,6 @@ History
394
404
 
395
405
  Bindings for Other Languages
396
406
  ------------------------------
397
-
398
407
  * Ruby: https://github.com/ankane/tomoto
399
408
 
400
409
  Bundled Libraries and Their License
@@ -2,6 +2,7 @@
2
2
  #include <numeric>
3
3
 
4
4
  #include "FoRelevance.h"
5
+ #include "Phraser.hpp"
5
6
 
6
7
  using namespace tomoto::label;
7
8
 
@@ -23,6 +24,26 @@ public:
23
24
  {
24
25
  return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
25
26
  }
27
+
28
+ auto begin() const -> decltype(doc->words.begin())
29
+ {
30
+ return doc->words.begin();
31
+ }
32
+
33
+ auto end() const -> decltype(doc->words.end())
34
+ {
35
+ return doc->words.end();
36
+ }
37
+
38
+ auto rbegin() const -> decltype(doc->words.rbegin())
39
+ {
40
+ return doc->words.rbegin();
41
+ }
42
+
43
+ auto rend() const -> decltype(doc->words.rend())
44
+ {
45
+ return doc->words.rend();
46
+ }
26
47
  };
27
48
 
28
49
  class DocIterator
@@ -61,9 +82,10 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
61
82
  {
62
83
  auto& vocabFreqs = tm->getVocabCf();
63
84
  auto& vocabDf = tm->getVocabDf();
64
- auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
85
+ auto candidates = phraser::extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
65
86
  vocabFreqs, vocabDf,
66
- candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
87
+ candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, 0.f,
88
+ normalized
67
89
  );
68
90
  if (minLabelLen <= 1)
69
91
  {
@@ -77,6 +99,29 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
77
99
  return candidates;
78
100
  }
79
101
 
102
+
103
+ std::vector<Candidate> tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const
104
+ {
105
+ auto& vocabFreqs = tm->getVocabCf();
106
+ auto& vocabDf = tm->getVocabDf();
107
+ auto candidates = phraser::extractPMIBENgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
108
+ vocabFreqs, vocabDf,
109
+ candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates,
110
+ 0.f, 0.f
111
+ );
112
+ if (minLabelLen <= 1)
113
+ {
114
+ for (size_t i = 0; i < vocabDf.size(); ++i)
115
+ {
116
+ if (vocabFreqs[i] < candMinCnt) continue;
117
+ if (vocabDf[i] < candMinDf) continue;
118
+ candidates.emplace_back(0.f, i);
119
+ }
120
+ }
121
+ return candidates;
122
+ }
123
+
124
+
80
125
  template<bool _lock>
81
126
  const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::DocumentBase* doc, const tomoto::Trie<tomoto::Vid, size_t>* root)
82
127
  {
@@ -4,6 +4,7 @@
4
4
  #include "Labeler.h"
5
5
  #include "../Utils/EigenAddonOps.hpp"
6
6
  #include "../Utils/Trie.hpp"
7
+ #include "../Utils/ThreadPool.hpp"
7
8
 
8
9
  /*
9
10
  Implementation of First-order Relevance for topic labeling by bab2min
@@ -16,166 +17,35 @@ namespace tomoto
16
17
  {
17
18
  namespace label
18
19
  {
19
- template<typename _DocIter, typename _Freqs>
20
- std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
21
- _Freqs&& vocabFreqs, _Freqs&& vocabDf,
22
- size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
20
+ class PMIExtractor : public IExtractor
23
21
  {
24
- struct vvhash
25
- {
26
- size_t operator()(const std::pair<Vid, Vid>& k) const
27
- {
28
- return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
29
- }
30
- };
31
-
32
- // counting unigrams & bigrams
33
- std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
34
-
35
- for(auto docIt = docBegin; docIt != docEnd; ++docIt)
36
- {
37
- std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
38
- auto doc = *docIt;
39
- Vid prevWord = doc[0];
40
- for (size_t j = 1; j < doc.size(); ++j)
41
- {
42
- Vid curWord = doc[j];
43
- if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
44
- {
45
- if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
46
- {
47
- bigramCnt[std::make_pair(prevWord, curWord)]++;
48
- uniqBigram.emplace(prevWord, curWord);
49
- }
50
- }
51
- prevWord = curWord;
52
- }
53
-
54
- for (auto& p : uniqBigram) bigramDf[p]++;
55
- }
56
-
57
-
58
- // counting ngrams
59
- std::vector<TrieEx<Vid, size_t>> trieNodes;
60
-
61
- if (maxNgrams > 2)
62
- {
63
- std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
64
- for (auto& p : bigramCnt)
65
- {
66
- if (p.second >= candMinCnt) validPair.emplace(p.first);
67
- }
68
-
69
- trieNodes.resize(1);
70
- auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
71
-
72
- for (auto docIt = docBegin; docIt != docEnd; ++docIt)
73
- {
74
- auto doc = *docIt;
75
- if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
76
- {
77
- trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
78
- }
79
-
80
- Vid prevWord = doc[0];
81
- size_t labelLen = 0;
82
- auto node = &trieNodes[0];
83
- if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
84
- {
85
- node = trieNodes[0].makeNext(prevWord, allocNode);
86
- node->val++;
87
- labelLen = 1;
88
- }
89
-
90
- for (size_t j = 1; j < doc.size(); ++j)
91
- {
92
- Vid curWord = doc[j];
93
-
94
- if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
95
- {
96
- node = &trieNodes[0];
97
- labelLen = 0;
98
- }
99
- else
100
- {
101
- if (labelLen >= maxNgrams)
102
- {
103
- node = node->getFail();
104
- labelLen--;
105
- }
106
-
107
- if (validPair.count(std::make_pair(prevWord, curWord)))
108
- {
109
- auto nnode = node->makeNext(curWord, allocNode);
110
- node = nnode;
111
- do
112
- {
113
- nnode->val++;
114
- } while (nnode = nnode->getFail());
115
- labelLen++;
116
- }
117
- else
118
- {
119
- node = trieNodes[0].makeNext(curWord, allocNode);
120
- node->val++;
121
- labelLen = 1;
122
- }
123
- }
124
- prevWord = curWord;
125
- }
126
- }
127
- }
128
-
129
- float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
130
-
131
- // calculating PMIs
132
- std::vector<Candidate> candidates;
133
- for (auto& p : bigramCnt)
134
- {
135
- auto& bigram = p.first;
136
- if (p.second < candMinCnt) continue;
137
- if (bigramDf[bigram] < candMinDf) continue;
138
- auto pmi = std::log(p.second * totN
139
- / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
140
- if (pmi <= 0) continue;
141
- candidates.emplace_back(pmi, bigram.first, bigram.second);
142
- }
143
-
144
- if (maxNgrams > 2)
22
+ size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
23
+ bool normalized;
24
+ public:
25
+ PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
26
+ size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000,
27
+ bool _normalized = false
28
+ )
29
+ : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf },
30
+ minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen },
31
+ maxCandidates{ _maxCandidates }, normalized{ _normalized }
145
32
  {
146
- std::vector<Vid> rkeys;
147
- trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
148
- {
149
- if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
150
- auto pmi = node->val / totN;
151
- for (auto k : rkeys)
152
- {
153
- pmi *= totN / vocabFreqs[k];
154
- }
155
- pmi = std::log(pmi);
156
- if (pmi < minScore) return;
157
- candidates.emplace_back(pmi, rkeys);
158
- }, rkeys);
159
33
  }
160
34
 
161
- std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
162
- {
163
- return a.score > b.score;
164
- });
165
- if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
166
- return candidates;
167
- }
168
-
35
+ std::vector<Candidate> extract(const ITopicModel* tm) const override;
36
+ };
169
37
 
170
- class PMIExtractor : public IExtractor
38
+ class PMIBEExtractor : public IExtractor
171
39
  {
172
40
  size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
173
41
  public:
174
- PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
175
- : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
42
+ PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
43
+ size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000
44
+ )
45
+ : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
176
46
  {
177
47
  }
178
-
48
+
179
49
  std::vector<Candidate> extract(const ITopicModel* tm) const override;
180
50
  };
181
51
 
@@ -212,7 +82,7 @@ namespace tomoto
212
82
 
213
83
  public:
214
84
  template<typename _Iter>
215
- FoRelevance(const ITopicModel* _tm,
85
+ FoRelevance(const ITopicModel* _tm,
216
86
  _Iter candFirst, _Iter candEnd,
217
87
  size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
218
88
  size_t _windowSize = (size_t)-1,