tomoto 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -0
- data/ext/tomoto/ct.cpp +54 -0
- data/ext/tomoto/dmr.cpp +62 -0
- data/ext/tomoto/dt.cpp +82 -0
- data/ext/tomoto/ext.cpp +27 -773
- data/ext/tomoto/gdmr.cpp +34 -0
- data/ext/tomoto/hdp.cpp +42 -0
- data/ext/tomoto/hlda.cpp +66 -0
- data/ext/tomoto/hpa.cpp +27 -0
- data/ext/tomoto/lda.cpp +250 -0
- data/ext/tomoto/llda.cpp +29 -0
- data/ext/tomoto/mglda.cpp +71 -0
- data/ext/tomoto/pa.cpp +27 -0
- data/ext/tomoto/plda.cpp +29 -0
- data/ext/tomoto/slda.cpp +40 -0
- data/ext/tomoto/utils.h +84 -0
- data/lib/tomoto/tomoto.bundle +0 -0
- data/lib/tomoto/tomoto.so +0 -0
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +12 -3
- data/vendor/tomotopy/README.rst +12 -3
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
- data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
- data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
- data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
- data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
- data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
- data/vendor/tomotopy/src/Utils/math.h +8 -4
- data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
- metadata +24 -60
@@ -0,0 +1,71 @@
|
|
1
|
+
#include <MGLDA.h>
|
2
|
+
|
3
|
+
#include <rice/Module.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the Ruby class "MGLDA" under module `m`. The class wraps
// tomoto::IMGLDAModel and is declared with tomoto::ILDAModel as its base
// binding. Everything defined here is a thin forwarder to the C++ model.
void init_mglda(Rice::Module& m) {
  auto klass = Rice::define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(m, "MGLDA");

  // Factory. `tw` arrives as an integer and is converted to the TermWeight
  // enum; every other argument is passed to create() unchanged and in order.
  klass.define_singleton_method(
    "_new",
    *[](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
      const auto weight = static_cast<tomoto::TermWeight>(tw);
      return tomoto::IMGLDAModel::create(weight, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
    });

  // Builds a raw document from `words`, stores the delimiter under the
  // "delimiter" key of its misc map, and hands it to addDoc().
  klass.define_method(
    "_add_doc",
    *[](tomoto::IMGLDAModel& model, std::vector<std::string> words, std::string delimiter) {
      auto raw = buildDoc(words);
      raw.misc["delimiter"] = delimiter;
      return model.addDoc(raw);
    });

  // Read-only accessors: each Ruby method returns the value of one getter.
  klass.define_method(
    "alpha_g",
    *[](tomoto::IMGLDAModel& model) { return model.getAlpha(); });
  klass.define_method(
    "alpha_l",
    *[](tomoto::IMGLDAModel& model) { return model.getAlphaL(); });
  klass.define_method(
    "alpha_mg",
    *[](tomoto::IMGLDAModel& model) { return model.getAlphaM(); });
  klass.define_method(
    "alpha_ml",
    *[](tomoto::IMGLDAModel& model) { return model.getAlphaML(); });
  klass.define_method(
    "eta_g",
    *[](tomoto::IMGLDAModel& model) { return model.getEta(); });
  klass.define_method(
    "eta_l",
    *[](tomoto::IMGLDAModel& model) { return model.getEtaL(); });
  klass.define_method(
    "gamma",
    *[](tomoto::IMGLDAModel& model) { return model.getGamma(); });
  klass.define_method(
    "k_g",
    *[](tomoto::IMGLDAModel& model) { return model.getK(); });
  klass.define_method(
    "k_l",
    *[](tomoto::IMGLDAModel& model) { return model.getKL(); });
  klass.define_method(
    "t",
    *[](tomoto::IMGLDAModel& model) { return model.getT(); });
}
|
data/ext/tomoto/pa.cpp
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#include <PA.h>
|
2
|
+
|
3
|
+
#include <rice/Module.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the Ruby class "PA" under module `m`, wrapping tomoto::IPAModel
// with tomoto::ILDAModel declared as its base binding.
void init_pa(Rice::Module& m) {
  auto klass = Rice::define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(m, "PA");

  // Factory. A negative `seed` is replaced by a value drawn from
  // std::random_device before the model is created.
  klass.define_singleton_method(
    "_new",
    *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
      if (seed < 0) {
        seed = std::random_device{}();
      }
      const auto weight = static_cast<tomoto::TermWeight>(tw);
      return tomoto::IPAModel::create(weight, k1, k2, alpha, eta, seed);
    });

  // Read-only accessors forwarding to the model's getters.
  klass.define_method(
    "k1",
    *[](tomoto::IPAModel& model) { return model.getK(); });
  klass.define_method(
    "k2",
    *[](tomoto::IPAModel& model) { return model.getK2(); });
}
|
data/ext/tomoto/plda.cpp
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#include <PLDA.h>
|
2
|
+
|
3
|
+
#include <rice/Module.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the Ruby class "PLDA" under module `m`, wrapping
// tomoto::IPLDAModel with tomoto::ILLDAModel declared as its base binding.
void init_plda(Rice::Module& m) {
  auto klass = Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA");

  // Factory. A negative `seed` is replaced by a value drawn from
  // std::random_device. The literal 1 is the third positional argument of
  // create() -- NOTE(review): presumably topics-per-label; confirm in PLDA.h.
  klass.define_singleton_method(
    "_new",
    *[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, int seed) {
      if (seed < 0) {
        seed = std::random_device{}();
      }
      const auto weight = static_cast<tomoto::TermWeight>(tw);
      return tomoto::IPLDAModel::create(weight, latent_topics, 1, alpha, eta, seed);
    });

  // Builds a raw document from `words`, stores `labels` under the "labels"
  // key of its misc map, and hands it to addDoc().
  klass.define_method(
    "_add_doc",
    *[](tomoto::IPLDAModel& model, std::vector<std::string> words, std::vector<std::string> labels) {
      auto raw = buildDoc(words);
      raw.misc["labels"] = labels;
      return model.addDoc(raw);
    });

  // Read-only accessor forwarding to getNumLatentTopics().
  klass.define_method(
    "latent_topics",
    *[](tomoto::IPLDAModel& model) { return model.getNumLatentTopics(); });
}
|
data/ext/tomoto/slda.cpp
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#include <SLDA.h>
|
2
|
+
|
3
|
+
#include <rice/Module.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the Ruby class "SLDA" under module `m`, wrapping
// tomoto::ISLDAModel with tomoto::ILDAModel declared as its base binding.
void init_slda(Rice::Module& m) {
  auto klass = Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA");

  // Factory. A negative `seed` is replaced by a value drawn from
  // std::random_device. Each element of `rb_vars` is read as an int and
  // converted to the ISLDAModel::GLM enum before being passed to create().
  klass.define_singleton_method(
    "_new",
    *[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, int seed) {
      if (seed < 0) {
        seed = std::random_device{}();
      }

      std::vector<tomoto::ISLDAModel::GLM> vars;
      vars.reserve(rb_vars.size());
      for (auto const& rb_var : rb_vars) {
        const int code = from_ruby<int>(rb_var);
        vars.push_back(static_cast<tomoto::ISLDAModel::GLM>(code));
      }

      const auto weight = static_cast<tomoto::TermWeight>(tw);
      return tomoto::ISLDAModel::create(weight, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
    });

  // Builds a raw document from `words`, stores the response values under the
  // "y" key of its misc map, and hands it to addDoc().
  klass.define_method(
    "_add_doc",
    *[](tomoto::ISLDAModel& model, std::vector<std::string> words, std::vector<tomoto::Float> y) {
      auto raw = buildDoc(words);
      raw.misc["y"] = y;
      return model.addDoc(raw);
    });

  // Read-only accessor forwarding to getF().
  klass.define_method(
    "f",
    *[](tomoto::ISLDAModel& model) { return model.getF(); });

  // Returns "l" when the variable's type is GLM::linear and "b" otherwise.
  // Throws std::runtime_error when `var_id` is not below getF().
  klass.define_method(
    "_var_type",
    *[](tomoto::ISLDAModel& model, size_t var_id) {
      if (var_id >= model.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
      const bool is_linear = model.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear;
      return is_linear ? "l" : "b";
    });
}
|
data/ext/tomoto/utils.h
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include <rice/Array.hpp>
|
4
|
+
|
5
|
+
using Rice::Array;
|
6
|
+
using Rice::Object;
|
7
|
+
|
8
|
+
template<>
|
9
|
+
inline
|
10
|
+
Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
|
11
|
+
{
|
12
|
+
Array res;
|
13
|
+
for (auto const& v : x) {
|
14
|
+
res.push(v);
|
15
|
+
}
|
16
|
+
return res;
|
17
|
+
}
|
18
|
+
|
19
|
+
template<>
|
20
|
+
inline
|
21
|
+
Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
|
22
|
+
{
|
23
|
+
Array res;
|
24
|
+
for (auto const& v : x) {
|
25
|
+
res.push(v);
|
26
|
+
}
|
27
|
+
return res;
|
28
|
+
}
|
29
|
+
|
30
|
+
template<>
|
31
|
+
inline
|
32
|
+
Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
|
33
|
+
{
|
34
|
+
Array res;
|
35
|
+
for (auto const& v : x) {
|
36
|
+
res.push(v);
|
37
|
+
}
|
38
|
+
return res;
|
39
|
+
}
|
40
|
+
|
41
|
+
template<>
|
42
|
+
inline
|
43
|
+
std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
|
44
|
+
{
|
45
|
+
Array a = Array(x);
|
46
|
+
std::vector<std::string> res;
|
47
|
+
res.reserve(a.size());
|
48
|
+
for (auto const& v : a) {
|
49
|
+
res.push_back(from_ruby<std::string>(v));
|
50
|
+
}
|
51
|
+
return res;
|
52
|
+
}
|
53
|
+
|
54
|
+
template<>
|
55
|
+
inline
|
56
|
+
std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
|
57
|
+
{
|
58
|
+
Array a = Array(x);
|
59
|
+
std::vector<tomoto::Float> res;
|
60
|
+
res.reserve(a.size());
|
61
|
+
for (auto const& v : a) {
|
62
|
+
res.push_back(from_ruby<tomoto::Float>(v));
|
63
|
+
}
|
64
|
+
return res;
|
65
|
+
}
|
66
|
+
|
67
|
+
template<>
|
68
|
+
inline
|
69
|
+
std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
|
70
|
+
{
|
71
|
+
Array a = Array(x);
|
72
|
+
std::vector<uint64_t> res;
|
73
|
+
res.reserve(a.size());
|
74
|
+
for (auto const& v : a) {
|
75
|
+
res.push_back(from_ruby<uint64_t>(v));
|
76
|
+
}
|
77
|
+
return res;
|
78
|
+
}
|
79
|
+
|
80
|
+
// Creates a tomoto::RawDoc whose rawWords is a copy of `words`; no other
// field is set here.
//
// `words` is only read, so it is taken by const reference. Existing callers
// that pass a non-const lvalue still bind to it, and const vectors and
// temporaries are now accepted as well.
inline tomoto::RawDoc buildDoc(const std::vector<std::string>& words) {
  tomoto::RawDoc doc;
  doc.rawWords = words;
  return doc;
}
|
Binary file
|
Binary file
|
data/lib/tomoto/version.rb
CHANGED
@@ -35,7 +35,7 @@ tomotopy 란?
|
|
35
35
|
|
36
36
|
더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
|
37
37
|
|
38
|
-
tomotopy의 가장 최신버전은 0.10.
|
38
|
+
tomotopy의 가장 최신버전은 0.10.2 입니다.
|
39
39
|
|
40
40
|
시작하기
|
41
41
|
---------------
|
@@ -245,7 +245,7 @@ LDA모델로 1000회 iteration을 수행시 걸리는 시간을 초 단위로
|
|
245
245
|
|
246
246
|
예제 코드
|
247
247
|
---------
|
248
|
-
tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/
|
248
|
+
tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/main/examples/ 를 확인하시길 바랍니다.
|
249
249
|
|
250
250
|
예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다.
|
251
251
|
|
@@ -255,6 +255,16 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
255
255
|
|
256
256
|
역사
|
257
257
|
-------
|
258
|
+
* 0.10.2 (2021-02-16)
|
259
|
+
* `tomotopy.CTModel.train`가 큰 K값에 대해 실패하는 문제가 수정되었습니다.
|
260
|
+
* `tomotopy.utils.Corpus`가 `uid`값을 잃는 문제가 수정되었습니다.
|
261
|
+
|
262
|
+
* 0.10.1 (2021-02-14)
|
263
|
+
* `tomotopy.utils.Corpus.extract_ngrams`에 빈 문헌을 입력시 발생하던 에러를 수정했습니다.
|
264
|
+
* `tomotopy.LDAModel.infer`가 올바른 입력에도 예외를 발생시키던 문제를 수정했습니다.
|
265
|
+
* `tomotopy.HLDAModel.infer`가 잘못된 `tomotopy.Document.path` 값을 생성하는 문제를 수정했습니다.
|
266
|
+
* `tomotopy.HLDAModel.train`에 새로운 파라미터 `freeze_topics`가 추가되었습니다. 이를 통해 학습 시 신규 토픽 생성 여부를 조정할 수 있습니다.
|
267
|
+
|
258
268
|
* 0.10.0 (2020-12-19)
|
259
269
|
* `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
|
260
270
|
* `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
|
@@ -387,7 +397,6 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
387
397
|
|
388
398
|
다른 언어용 바인딩
|
389
399
|
-------------------
|
390
|
-
|
391
400
|
* Ruby: https://github.com/ankane/tomoto
|
392
401
|
|
393
402
|
포함된 라이브러리들의 라이센스
|
data/vendor/tomotopy/README.rst
CHANGED
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
|
|
36
36
|
|
37
37
|
Please visit https://bab2min.github.io/tomotopy to see more information.
|
38
38
|
|
39
|
-
The most recent version of tomotopy is 0.10.
|
39
|
+
The most recent version of tomotopy is 0.10.2.
|
40
40
|
|
41
41
|
Getting Started
|
42
42
|
---------------
|
@@ -250,7 +250,7 @@ See `word_prior_example` in `example.py` for more details.
|
|
250
250
|
|
251
251
|
Examples
|
252
252
|
--------
|
253
|
-
You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/
|
253
|
+
You can find example Python code for tomotopy at https://github.com/bab2min/tomotopy/blob/main/examples/ .
|
254
254
|
|
255
255
|
You can also get the data file used in the example code at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view .
|
256
256
|
|
@@ -261,6 +261,16 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
|
|
261
261
|
|
262
262
|
History
|
263
263
|
-------
|
264
|
+
* 0.10.2 (2021-02-16)
|
265
|
+
* An issue was fixed where `tomotopy.CTModel.train` fails with large K.
|
266
|
+
* An issue was fixed where `tomotopy.utils.Corpus` loses its `uid` values.
|
267
|
+
|
268
|
+
* 0.10.1 (2021-02-14)
|
269
|
+
* An issue was fixed where `tomotopy.utils.Corpus.extract_ngrams` crashes with empty input.
|
270
|
+
* An issue was fixed where `tomotopy.LDAModel.infer` raises exception with valid input.
|
271
|
+
* An issue was fixed where `tomotopy.HLDAModel.infer` generates wrong `tomotopy.Document.path`.
|
272
|
+
* Since a new parameter `freeze_topics` for `tomotopy.HLDAModel.train` was added, you can control whether to create a new topic or not when training.
|
273
|
+
|
264
274
|
* 0.10.0 (2020-12-19)
|
265
275
|
* The interfaces of `tomotopy.utils.Corpus` and `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
|
266
276
|
* __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also indexing by Iterable[int] and slicing are supported. Indexing by uid is also supported.
|
@@ -394,7 +404,6 @@ History
|
|
394
404
|
|
395
405
|
Bindings for Other Languages
|
396
406
|
------------------------------
|
397
|
-
|
398
407
|
* Ruby: https://github.com/ankane/tomoto
|
399
408
|
|
400
409
|
Bundled Libraries and Their License
|
@@ -2,6 +2,7 @@
|
|
2
2
|
#include <numeric>
|
3
3
|
|
4
4
|
#include "FoRelevance.h"
|
5
|
+
#include "Phraser.hpp"
|
5
6
|
|
6
7
|
using namespace tomoto::label;
|
7
8
|
|
@@ -23,6 +24,26 @@ public:
|
|
23
24
|
{
|
24
25
|
return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
|
25
26
|
}
|
27
|
+
|
28
|
+
auto begin() const -> decltype(doc->words.begin())
|
29
|
+
{
|
30
|
+
return doc->words.begin();
|
31
|
+
}
|
32
|
+
|
33
|
+
auto end() const -> decltype(doc->words.end())
|
34
|
+
{
|
35
|
+
return doc->words.end();
|
36
|
+
}
|
37
|
+
|
38
|
+
auto rbegin() const -> decltype(doc->words.rbegin())
|
39
|
+
{
|
40
|
+
return doc->words.rbegin();
|
41
|
+
}
|
42
|
+
|
43
|
+
auto rend() const -> decltype(doc->words.rend())
|
44
|
+
{
|
45
|
+
return doc->words.rend();
|
46
|
+
}
|
26
47
|
};
|
27
48
|
|
28
49
|
class DocIterator
|
@@ -61,9 +82,10 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
|
|
61
82
|
{
|
62
83
|
auto& vocabFreqs = tm->getVocabCf();
|
63
84
|
auto& vocabDf = tm->getVocabDf();
|
64
|
-
auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
|
85
|
+
auto candidates = phraser::extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
|
65
86
|
vocabFreqs, vocabDf,
|
66
|
-
candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates,
|
87
|
+
candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, 0.f,
|
88
|
+
normalized
|
67
89
|
);
|
68
90
|
if (minLabelLen <= 1)
|
69
91
|
{
|
@@ -77,6 +99,29 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
|
|
77
99
|
return candidates;
|
78
100
|
}
|
79
101
|
|
102
|
+
|
103
|
+
std::vector<Candidate> tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const
|
104
|
+
{
|
105
|
+
auto& vocabFreqs = tm->getVocabCf();
|
106
|
+
auto& vocabDf = tm->getVocabDf();
|
107
|
+
auto candidates = phraser::extractPMIBENgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
|
108
|
+
vocabFreqs, vocabDf,
|
109
|
+
candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates,
|
110
|
+
0.f, 0.f
|
111
|
+
);
|
112
|
+
if (minLabelLen <= 1)
|
113
|
+
{
|
114
|
+
for (size_t i = 0; i < vocabDf.size(); ++i)
|
115
|
+
{
|
116
|
+
if (vocabFreqs[i] < candMinCnt) continue;
|
117
|
+
if (vocabDf[i] < candMinDf) continue;
|
118
|
+
candidates.emplace_back(0.f, i);
|
119
|
+
}
|
120
|
+
}
|
121
|
+
return candidates;
|
122
|
+
}
|
123
|
+
|
124
|
+
|
80
125
|
template<bool _lock>
|
81
126
|
const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::DocumentBase* doc, const tomoto::Trie<tomoto::Vid, size_t>* root)
|
82
127
|
{
|
@@ -4,6 +4,7 @@
|
|
4
4
|
#include "Labeler.h"
|
5
5
|
#include "../Utils/EigenAddonOps.hpp"
|
6
6
|
#include "../Utils/Trie.hpp"
|
7
|
+
#include "../Utils/ThreadPool.hpp"
|
7
8
|
|
8
9
|
/*
|
9
10
|
Implementation of First-order Relevance for topic labeling by bab2min
|
@@ -16,166 +17,35 @@ namespace tomoto
|
|
16
17
|
{
|
17
18
|
namespace label
|
18
19
|
{
|
19
|
-
|
20
|
-
std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
|
21
|
-
_Freqs&& vocabFreqs, _Freqs&& vocabDf,
|
22
|
-
size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
|
20
|
+
class PMIExtractor : public IExtractor
|
23
21
|
{
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
for(auto docIt = docBegin; docIt != docEnd; ++docIt)
|
36
|
-
{
|
37
|
-
std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
|
38
|
-
auto doc = *docIt;
|
39
|
-
Vid prevWord = doc[0];
|
40
|
-
for (size_t j = 1; j < doc.size(); ++j)
|
41
|
-
{
|
42
|
-
Vid curWord = doc[j];
|
43
|
-
if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
|
44
|
-
{
|
45
|
-
if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
|
46
|
-
{
|
47
|
-
bigramCnt[std::make_pair(prevWord, curWord)]++;
|
48
|
-
uniqBigram.emplace(prevWord, curWord);
|
49
|
-
}
|
50
|
-
}
|
51
|
-
prevWord = curWord;
|
52
|
-
}
|
53
|
-
|
54
|
-
for (auto& p : uniqBigram) bigramDf[p]++;
|
55
|
-
}
|
56
|
-
|
57
|
-
|
58
|
-
// counting ngrams
|
59
|
-
std::vector<TrieEx<Vid, size_t>> trieNodes;
|
60
|
-
|
61
|
-
if (maxNgrams > 2)
|
62
|
-
{
|
63
|
-
std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
|
64
|
-
for (auto& p : bigramCnt)
|
65
|
-
{
|
66
|
-
if (p.second >= candMinCnt) validPair.emplace(p.first);
|
67
|
-
}
|
68
|
-
|
69
|
-
trieNodes.resize(1);
|
70
|
-
auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
|
71
|
-
|
72
|
-
for (auto docIt = docBegin; docIt != docEnd; ++docIt)
|
73
|
-
{
|
74
|
-
auto doc = *docIt;
|
75
|
-
if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
|
76
|
-
{
|
77
|
-
trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
|
78
|
-
}
|
79
|
-
|
80
|
-
Vid prevWord = doc[0];
|
81
|
-
size_t labelLen = 0;
|
82
|
-
auto node = &trieNodes[0];
|
83
|
-
if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
|
84
|
-
{
|
85
|
-
node = trieNodes[0].makeNext(prevWord, allocNode);
|
86
|
-
node->val++;
|
87
|
-
labelLen = 1;
|
88
|
-
}
|
89
|
-
|
90
|
-
for (size_t j = 1; j < doc.size(); ++j)
|
91
|
-
{
|
92
|
-
Vid curWord = doc[j];
|
93
|
-
|
94
|
-
if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
|
95
|
-
{
|
96
|
-
node = &trieNodes[0];
|
97
|
-
labelLen = 0;
|
98
|
-
}
|
99
|
-
else
|
100
|
-
{
|
101
|
-
if (labelLen >= maxNgrams)
|
102
|
-
{
|
103
|
-
node = node->getFail();
|
104
|
-
labelLen--;
|
105
|
-
}
|
106
|
-
|
107
|
-
if (validPair.count(std::make_pair(prevWord, curWord)))
|
108
|
-
{
|
109
|
-
auto nnode = node->makeNext(curWord, allocNode);
|
110
|
-
node = nnode;
|
111
|
-
do
|
112
|
-
{
|
113
|
-
nnode->val++;
|
114
|
-
} while (nnode = nnode->getFail());
|
115
|
-
labelLen++;
|
116
|
-
}
|
117
|
-
else
|
118
|
-
{
|
119
|
-
node = trieNodes[0].makeNext(curWord, allocNode);
|
120
|
-
node->val++;
|
121
|
-
labelLen = 1;
|
122
|
-
}
|
123
|
-
}
|
124
|
-
prevWord = curWord;
|
125
|
-
}
|
126
|
-
}
|
127
|
-
}
|
128
|
-
|
129
|
-
float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
|
130
|
-
|
131
|
-
// calculating PMIs
|
132
|
-
std::vector<Candidate> candidates;
|
133
|
-
for (auto& p : bigramCnt)
|
134
|
-
{
|
135
|
-
auto& bigram = p.first;
|
136
|
-
if (p.second < candMinCnt) continue;
|
137
|
-
if (bigramDf[bigram] < candMinDf) continue;
|
138
|
-
auto pmi = std::log(p.second * totN
|
139
|
-
/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
|
140
|
-
if (pmi <= 0) continue;
|
141
|
-
candidates.emplace_back(pmi, bigram.first, bigram.second);
|
142
|
-
}
|
143
|
-
|
144
|
-
if (maxNgrams > 2)
|
22
|
+
size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
|
23
|
+
bool normalized;
|
24
|
+
public:
|
25
|
+
PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
|
26
|
+
size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000,
|
27
|
+
bool _normalized = false
|
28
|
+
)
|
29
|
+
: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf },
|
30
|
+
minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen },
|
31
|
+
maxCandidates{ _maxCandidates }, normalized{ _normalized }
|
145
32
|
{
|
146
|
-
std::vector<Vid> rkeys;
|
147
|
-
trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
|
148
|
-
{
|
149
|
-
if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
|
150
|
-
auto pmi = node->val / totN;
|
151
|
-
for (auto k : rkeys)
|
152
|
-
{
|
153
|
-
pmi *= totN / vocabFreqs[k];
|
154
|
-
}
|
155
|
-
pmi = std::log(pmi);
|
156
|
-
if (pmi < minScore) return;
|
157
|
-
candidates.emplace_back(pmi, rkeys);
|
158
|
-
}, rkeys);
|
159
33
|
}
|
160
34
|
|
161
|
-
std::
|
162
|
-
|
163
|
-
return a.score > b.score;
|
164
|
-
});
|
165
|
-
if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
|
166
|
-
return candidates;
|
167
|
-
}
|
168
|
-
|
35
|
+
std::vector<Candidate> extract(const ITopicModel* tm) const override;
|
36
|
+
};
|
169
37
|
|
170
|
-
class
|
38
|
+
class PMIBEExtractor : public IExtractor
|
171
39
|
{
|
172
40
|
size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
|
173
41
|
public:
|
174
|
-
|
175
|
-
|
42
|
+
PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
|
43
|
+
size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000
|
44
|
+
)
|
45
|
+
: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
|
176
46
|
{
|
177
47
|
}
|
178
|
-
|
48
|
+
|
179
49
|
std::vector<Candidate> extract(const ITopicModel* tm) const override;
|
180
50
|
};
|
181
51
|
|
@@ -212,7 +82,7 @@ namespace tomoto
|
|
212
82
|
|
213
83
|
public:
|
214
84
|
template<typename _Iter>
|
215
|
-
FoRelevance(const ITopicModel* _tm,
|
85
|
+
FoRelevance(const ITopicModel* _tm,
|
216
86
|
_Iter candFirst, _Iter candEnd,
|
217
87
|
size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
|
218
88
|
size_t _windowSize = (size_t)-1,
|