tomoto 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -0
- data/ext/tomoto/ct.cpp +54 -0
- data/ext/tomoto/dmr.cpp +62 -0
- data/ext/tomoto/dt.cpp +82 -0
- data/ext/tomoto/ext.cpp +27 -773
- data/ext/tomoto/gdmr.cpp +34 -0
- data/ext/tomoto/hdp.cpp +42 -0
- data/ext/tomoto/hlda.cpp +66 -0
- data/ext/tomoto/hpa.cpp +27 -0
- data/ext/tomoto/lda.cpp +250 -0
- data/ext/tomoto/llda.cpp +29 -0
- data/ext/tomoto/mglda.cpp +71 -0
- data/ext/tomoto/pa.cpp +27 -0
- data/ext/tomoto/plda.cpp +29 -0
- data/ext/tomoto/slda.cpp +40 -0
- data/ext/tomoto/utils.h +84 -0
- data/lib/tomoto/tomoto.bundle +0 -0
- data/lib/tomoto/tomoto.so +0 -0
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +12 -3
- data/vendor/tomotopy/README.rst +12 -3
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
- data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
- data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
- data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
- data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
- data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
- data/vendor/tomotopy/src/Utils/math.h +8 -4
- data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
- metadata +24 -60
data/ext/tomoto/mglda.cpp ADDED
@@ -0,0 +1,71 @@
+#include <MGLDA.h>
+
+#include <rice/Module.hpp>
+
+#include "utils.h"
+
+void init_mglda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(m, "MGLDA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
+        return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
+      })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
+        auto doc = buildDoc(words);
+        doc.misc["delimiter"] = delimiter;
+        return self.addDoc(doc);
+      })
+    .define_method(
+      "alpha_g",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlpha();
+      })
+    .define_method(
+      "alpha_l",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlphaL();
+      })
+    .define_method(
+      "alpha_mg",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlphaM();
+      })
+    .define_method(
+      "alpha_ml",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getAlphaML();
+      })
+    .define_method(
+      "eta_g",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getEta();
+      })
+    .define_method(
+      "eta_l",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getEtaL();
+      })
+    .define_method(
+      "gamma",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getGamma();
+      })
+    .define_method(
+      "k_g",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getK();
+      })
+    .define_method(
+      "k_l",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getKL();
+      })
+    .define_method(
+      "t",
+      *[](tomoto::IMGLDAModel& self) {
+        return self.getT();
+      });
+}
data/ext/tomoto/pa.cpp ADDED
@@ -0,0 +1,27 @@
+#include <PA.h>
+
+#include <rice/Module.hpp>
+
+#include "utils.h"
+
+void init_pa(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(m, "PA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
+      })
+    .define_method(
+      "k1",
+      *[](tomoto::IPAModel& self) {
+        return self.getK();
+      })
+    .define_method(
+      "k2",
+      *[](tomoto::IPAModel& self) {
+        return self.getK2();
+      });
+}
data/ext/tomoto/plda.cpp ADDED
@@ -0,0 +1,29 @@
+#include <PLDA.h>
+
+#include <rice/Module.hpp>
+
+#include "utils.h"
+
+void init_plda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
+      })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
+        auto doc = buildDoc(words);
+        doc.misc["labels"] = labels;
+        return self.addDoc(doc);
+      })
+    .define_method(
+      "latent_topics",
+      *[](tomoto::IPLDAModel& self) {
+        return self.getNumLatentTopics();
+      });
+}
data/ext/tomoto/slda.cpp ADDED
@@ -0,0 +1,40 @@
+#include <SLDA.h>
+
+#include <rice/Module.hpp>
+
+#include "utils.h"
+
+void init_slda(Rice::Module& m) {
+  Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
+    .define_singleton_method(
+      "_new",
+      *[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, int seed) {
+        if (seed < 0) {
+          seed = std::random_device{}();
+        }
+        std::vector<tomoto::ISLDAModel::GLM> vars;
+        vars.reserve(rb_vars.size());
+        for (auto const& v : rb_vars) {
+          vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
+        }
+        return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
+      })
+    .define_method(
+      "_add_doc",
+      *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
+        auto doc = buildDoc(words);
+        doc.misc["y"] = y;
+        return self.addDoc(doc);
+      })
+    .define_method(
+      "f",
+      *[](tomoto::ISLDAModel& self) {
+        return self.getF();
+      })
+    .define_method(
+      "_var_type",
+      *[](tomoto::ISLDAModel& self, size_t var_id) {
+        if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
+        return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
+      });
+}
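The file list above shows data/ext/tomoto/ext.cpp shrinking from roughly 800 lines to 27 while one binding file per model is added, so ext.cpp presumably now only declares and calls the per-model init functions. Its new body is not included in the hunks shown here; the following is a hypothetical sketch of that wiring, with the entry-point name Init_tomoto and module name Tomoto inferred from the gem layout (lib/tomoto/tomoto.so) rather than taken from this diff:

#include <rice/Module.hpp>

// Declarations matching the init functions defined in the files shown above;
// the full gem would declare one per model source file.
void init_mglda(Rice::Module& m);
void init_pa(Rice::Module& m);
void init_plda(Rice::Module& m);
void init_slda(Rice::Module& m);

extern "C"
void Init_tomoto() {
  // Register every model class under a single Ruby module.
  Rice::Module m = Rice::define_module("Tomoto");
  init_mglda(m);
  init_pa(m);
  init_plda(m);
  init_slda(m);
}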
data/ext/tomoto/utils.h ADDED
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <rice/Array.hpp>
+
+using Rice::Array;
+using Rice::Object;
+
+template<>
+inline
+Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
+{
+  Array res;
+  for (auto const& v : x) {
+    res.push(v);
+  }
+  return res;
+}
+
+template<>
+inline
+Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
+{
+  Array res;
+  for (auto const& v : x) {
+    res.push(v);
+  }
+  return res;
+}
+
+template<>
+inline
+Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
+{
+  Array res;
+  for (auto const& v : x) {
+    res.push(v);
+  }
+  return res;
+}
+
+template<>
+inline
+std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
+{
+  Array a = Array(x);
+  std::vector<std::string> res;
+  res.reserve(a.size());
+  for (auto const& v : a) {
+    res.push_back(from_ruby<std::string>(v));
+  }
+  return res;
+}
+
+template<>
+inline
+std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
+{
+  Array a = Array(x);
+  std::vector<tomoto::Float> res;
+  res.reserve(a.size());
+  for (auto const& v : a) {
+    res.push_back(from_ruby<tomoto::Float>(v));
+  }
+  return res;
+}
+
+template<>
+inline
+std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
+{
+  Array a = Array(x);
+  std::vector<uint64_t> res;
+  res.reserve(a.size());
+  for (auto const& v : a) {
+    res.push_back(from_ruby<uint64_t>(v));
+  }
+  return res;
+}
+
+inline tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
+  tomoto::RawDoc doc;
+  doc.rawWords = words;
+  return doc;
+}
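These to_ruby/from_ruby specializations are what let the binding lambdas above take and return std::vector values directly, with Rice doing the conversion at the Ruby/C++ boundary. A minimal sketch of the round trip, not part of this diff (the function name is invented, and utils.h is assumed to be included after the tomoto headers, as the binding files above do):

#include <cstdint>
#include <vector>
#include <rice/Array.hpp>
#include "utils.h"  // assumes the tomoto headers it relies on were included first

Rice::Object double_counts(Rice::Object input) {
  // Ruby Array of Integers -> std::vector<uint64_t> via the from_ruby specialization
  std::vector<uint64_t> values = from_ruby<std::vector<uint64_t>>(input);
  for (auto& v : values) {
    v *= 2;
  }
  // std::vector<uint64_t> -> Ruby Array via the to_ruby specialization
  return to_ruby(values);
}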
data/lib/tomoto/tomoto.bundle CHANGED
Binary file

data/lib/tomoto/tomoto.so CHANGED
Binary file

data/lib/tomoto/version.rb CHANGED
data/vendor/tomotopy/README.kr.rst CHANGED
@@ -35,7 +35,7 @@ tomotopy 란?
 
 더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
 
-tomotopy의 가장 최신버전은 0.10.
+tomotopy의 가장 최신버전은 0.10.2 입니다.
 
 시작하기
 ---------------
@@ -245,7 +245,7 @@ LDA모델로 1000회 iteration을 수행시 걸리는 시간을 초 단위로
 
 예제 코드
 ---------
-tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/
+tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/main/examples/ 를 확인하시길 바랍니다.
 
 예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다.
 
@@ -255,6 +255,16 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 
 역사
 -------
+* 0.10.2 (2021-02-16)
+  * `tomotopy.CTModel.train`가 큰 K값에 대해 실패하는 문제가 수정되었습니다.
+  * `tomotopy.utils.Corpus`가 `uid`값을 잃는 문제가 수정되었습니다.
+
+* 0.10.1 (2021-02-14)
+  * `tomotopy.utils.Corpus.extract_ngrams`에 빈 문헌을 입력시 발생하던 에러를 수정했습니다.
+  * `tomotopy.LDAModel.infer`가 올바른 입력에도 예외를 발생시키던 문제를 수정했습니다.
+  * `tomotopy.HLDAModel.infer`가 잘못된 `tomotopy.Document.path` 값을 생성하는 문제를 수정했습니다.
+  * `tomotopy.HLDAModel.train`에 새로운 파라미터 `freeze_topics`가 추가되었습니다. 이를 통해 학습 시 신규 토픽 생성 여부를 조정할 수 있습니다.
+
 * 0.10.0 (2020-12-19)
   * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
   * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
@@ -387,7 +397,6 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 
 다른 언어용 바인딩
 -------------------
-
 * Ruby: https://github.com/ankane/tomoto
 
 포함된 라이브러리들의 라이센스
data/vendor/tomotopy/README.rst CHANGED
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
 
 Please visit https://bab2min.github.io/tomotopy to see more information.
 
-The most recent version of tomotopy is 0.10.
+The most recent version of tomotopy is 0.10.2.
 
 Getting Started
 ---------------
@@ -250,7 +250,7 @@ See `word_prior_example` in `example.py` for more details.
 
 Examples
 --------
-You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/
+You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/main/examples/ .
 
 You can also get the data file used in the example code at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view .
 
@@ -261,6 +261,16 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 
 History
 -------
+* 0.10.2 (2021-02-16)
+  * An issue was fixed where `tomotopy.CTModel.train` fails with large K.
+  * An issue was fixed where `tomotopy.utils.Corpus` loses their `uid` values.
+
+* 0.10.1 (2021-02-14)
+  * An issue was fixed where `tomotopy.utils.Corpus.extract_ngrams` craches with empty input.
+  * An issue was fixed where `tomotopy.LDAModel.infer` raises exception with valid input.
+  * An issue was fixed where `tomotopy.HLDAModel.infer` generates wrong `tomotopy.Document.path`.
+  * Since a new parameter `freeze_topics` for `tomotopy.HLDAModel.train` was added, you can control whether to create a new topic or not when training.
+
 * 0.10.0 (2020-12-19)
   * The interface of `tomotopy.utils.Corpus` and of `tomotopy.LDAModel.docs` were unified. Now you can access the document in corpus with the same manner.
   * __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also by Iterable[int], slicing are supported. Also indexing by uid is supported.
@@ -394,7 +404,6 @@ History
 
 Bindings for Other Languages
 ------------------------------
-
 * Ruby: https://github.com/ankane/tomoto
 
 Bundled Libraries and Their License
data/vendor/tomotopy/src/Labeling/FoRelevance.cpp CHANGED
@@ -2,6 +2,7 @@
 #include <numeric>
 
 #include "FoRelevance.h"
+#include "Phraser.hpp"
 
 using namespace tomoto::label;
 
@@ -23,6 +24,26 @@ public:
     {
         return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
     }
+
+    auto begin() const -> decltype(doc->words.begin())
+    {
+        return doc->words.begin();
+    }
+
+    auto end() const -> decltype(doc->words.end())
+    {
+        return doc->words.end();
+    }
+
+    auto rbegin() const -> decltype(doc->words.rbegin())
+    {
+        return doc->words.rbegin();
+    }
+
+    auto rend() const -> decltype(doc->words.rend())
+    {
+        return doc->words.rend();
+    }
 };
 
 class DocIterator
@@ -61,9 +82,10 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
 {
     auto& vocabFreqs = tm->getVocabCf();
     auto& vocabDf = tm->getVocabDf();
-    auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
+    auto candidates = phraser::extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
        vocabFreqs, vocabDf,
-       candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates,
+       candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, 0.f,
+       normalized
     );
     if (minLabelLen <= 1)
     {
@@ -77,6 +99,29 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
     return candidates;
 }
 
+
+std::vector<Candidate> tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const
+{
+    auto& vocabFreqs = tm->getVocabCf();
+    auto& vocabDf = tm->getVocabDf();
+    auto candidates = phraser::extractPMIBENgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
+       vocabFreqs, vocabDf,
+       candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates,
+       0.f, 0.f
+    );
+    if (minLabelLen <= 1)
+    {
+        for (size_t i = 0; i < vocabDf.size(); ++i)
+        {
+            if (vocabFreqs[i] < candMinCnt) continue;
+            if (vocabDf[i] < candMinDf) continue;
+            candidates.emplace_back(0.f, i);
+        }
+    }
+    return candidates;
+}
+
+
 template<bool _lock>
 const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::DocumentBase* doc, const tomoto::Trie<tomoto::Vid, size_t>* root)
 {
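The hunks above route PMIExtractor::extract through the new Phraser.hpp helpers (forwarding the normalized flag) and add a PMIBEExtractor::extract implementation alongside it. A minimal usage sketch, assuming a trained model exposing tomoto::ITopicModel and the constructor defaults declared in the FoRelevance.h hunk below; the include path and function name are illustrative only:

#include <vector>
#include "Labeling/FoRelevance.h"  // path assumed relative to tomotopy's src directory

std::vector<tomoto::label::Candidate> labelCandidates(const tomoto::ITopicModel* model) {
    // PMI n-gram candidates; the last argument is the normalized switch used above
    tomoto::label::PMIExtractor pmi{ 10, 2, 1, 5, 1000, true };
    auto candidates = pmi.extract(model);

    // PMIBE variant whose extract() is implemented in the hunk above
    tomoto::label::PMIBEExtractor pmibe{ 10, 2, 1, 5, 1000 };
    auto beCandidates = pmibe.extract(model);

    candidates.insert(candidates.end(), beCandidates.begin(), beCandidates.end());
    return candidates;
}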
data/vendor/tomotopy/src/Labeling/FoRelevance.h CHANGED
@@ -4,6 +4,7 @@
 #include "Labeler.h"
 #include "../Utils/EigenAddonOps.hpp"
 #include "../Utils/Trie.hpp"
+#include "../Utils/ThreadPool.hpp"
 
 /*
 Implementation of First-order Relevance for topic labeling by bab2min
@@ -16,166 +17,35 @@ namespace tomoto
 {
     namespace label
     {
-
-        std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
-            _Freqs&& vocabFreqs, _Freqs&& vocabDf,
-            size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
+        class PMIExtractor : public IExtractor
         {
-
-
-
-
-
-
-
-
-
-
-
-            for(auto docIt = docBegin; docIt != docEnd; ++docIt)
-            {
-                std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
-                auto doc = *docIt;
-                Vid prevWord = doc[0];
-                for (size_t j = 1; j < doc.size(); ++j)
-                {
-                    Vid curWord = doc[j];
-                    if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
-                    {
-                        if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
-                        {
-                            bigramCnt[std::make_pair(prevWord, curWord)]++;
-                            uniqBigram.emplace(prevWord, curWord);
-                        }
-                    }
-                    prevWord = curWord;
-                }
-
-                for (auto& p : uniqBigram) bigramDf[p]++;
-            }
-
-
-            // counting ngrams
-            std::vector<TrieEx<Vid, size_t>> trieNodes;
-
-            if (maxNgrams > 2)
-            {
-                std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
-                for (auto& p : bigramCnt)
-                {
-                    if (p.second >= candMinCnt) validPair.emplace(p.first);
-                }
-
-                trieNodes.resize(1);
-                auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
-
-                for (auto docIt = docBegin; docIt != docEnd; ++docIt)
-                {
-                    auto doc = *docIt;
-                    if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
-                    {
-                        trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
-                    }
-
-                    Vid prevWord = doc[0];
-                    size_t labelLen = 0;
-                    auto node = &trieNodes[0];
-                    if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
-                    {
-                        node = trieNodes[0].makeNext(prevWord, allocNode);
-                        node->val++;
-                        labelLen = 1;
-                    }
-
-                    for (size_t j = 1; j < doc.size(); ++j)
-                    {
-                        Vid curWord = doc[j];
-
-                        if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
-                        {
-                            node = &trieNodes[0];
-                            labelLen = 0;
-                        }
-                        else
-                        {
-                            if (labelLen >= maxNgrams)
-                            {
-                                node = node->getFail();
-                                labelLen--;
-                            }
-
-                            if (validPair.count(std::make_pair(prevWord, curWord)))
-                            {
-                                auto nnode = node->makeNext(curWord, allocNode);
-                                node = nnode;
-                                do
-                                {
-                                    nnode->val++;
-                                } while (nnode = nnode->getFail());
-                                labelLen++;
-                            }
-                            else
-                            {
-                                node = trieNodes[0].makeNext(curWord, allocNode);
-                                node->val++;
-                                labelLen = 1;
-                            }
-                        }
-                        prevWord = curWord;
-                    }
-                }
-            }
-
-            float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
-
-            // calculating PMIs
-            std::vector<Candidate> candidates;
-            for (auto& p : bigramCnt)
-            {
-                auto& bigram = p.first;
-                if (p.second < candMinCnt) continue;
-                if (bigramDf[bigram] < candMinDf) continue;
-                auto pmi = std::log(p.second * totN
-                    / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
-                if (pmi <= 0) continue;
-                candidates.emplace_back(pmi, bigram.first, bigram.second);
-            }
-
-            if (maxNgrams > 2)
+            size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
+            bool normalized;
+        public:
+            PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
+                size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000,
+                bool _normalized = false
+            )
+                : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf },
+                minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen },
+                maxCandidates{ _maxCandidates }, normalized{ _normalized }
             {
-                std::vector<Vid> rkeys;
-                trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
-                {
-                    if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
-                    auto pmi = node->val / totN;
-                    for (auto k : rkeys)
-                    {
-                        pmi *= totN / vocabFreqs[k];
-                    }
-                    pmi = std::log(pmi);
-                    if (pmi < minScore) return;
-                    candidates.emplace_back(pmi, rkeys);
-                }, rkeys);
             }
 
-            std::
-
-            return a.score > b.score;
-            });
-            if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
-            return candidates;
-        }
-
+            std::vector<Candidate> extract(const ITopicModel* tm) const override;
+        };
 
-        class
+        class PMIBEExtractor : public IExtractor
         {
            size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
        public:
-
-
+            PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
+                size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000
+            )
+                : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
            {
            }
-
+
            std::vector<Candidate> extract(const ITopicModel* tm) const override;
        };
 
@@ -212,7 +82,7 @@ namespace tomoto
 
    public:
        template<typename _Iter>
-       FoRelevance(const ITopicModel* _tm,
+       FoRelevance(const ITopicModel* _tm,
            _Iter candFirst, _Iter candEnd,
            size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
            size_t _windowSize = (size_t)-1,