tomoto 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/ext/tomoto/ext.cpp +34 -9
- data/ext/tomoto/extconf.rb +2 -1
- data/lib/tomoto/dmr.rb +1 -1
- data/lib/tomoto/gdmr.rb +1 -1
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/LICENSE +1 -1
- data/vendor/tomotopy/README.kr.rst +32 -3
- data/vendor/tomotopy/README.rst +30 -1
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
- data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
- data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
- data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
- data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
- data/vendor/tomotopy/src/Utils/math.h +1 -1
- data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
- data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- data/vendor/variant/include/mapbox/optional.hpp +74 -0
- data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
- data/vendor/variant/include/mapbox/variant.hpp +974 -0
- data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
- metadata +15 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
|
4
|
+
data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
|
7
|
+
data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
:tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://
|
5
|
+
[![Build Status](https://github.com/ankane/tomoto/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -19,7 +19,7 @@ It can take 10-20 minutes to compile the extension.
|
|
19
19
|
Train a model
|
20
20
|
|
21
21
|
```ruby
|
22
|
-
model = Tomoto::LDA.new(k:
|
22
|
+
model = Tomoto::LDA.new(k: 2)
|
23
23
|
model.add_doc("text from document one")
|
24
24
|
model.add_doc("text from document two")
|
25
25
|
model.add_doc("text from document three")
|
@@ -98,7 +98,7 @@ If a method or option you need isn’t supported, feel free to open an issue.
|
|
98
98
|
## Examples
|
99
99
|
|
100
100
|
- [LDA](examples/lda_basic.rb)
|
101
|
-
- [HDP](examples/
|
101
|
+
- [HDP](examples/hdp_basic.rb)
|
102
102
|
|
103
103
|
## Tokenization
|
104
104
|
|
data/ext/tomoto/ext.cpp
CHANGED
@@ -96,6 +96,12 @@ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
|
|
96
96
|
return res;
|
97
97
|
}
|
98
98
|
|
99
|
+
tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
|
100
|
+
tomoto::RawDoc doc;
|
101
|
+
doc.rawWords = words;
|
102
|
+
return doc;
|
103
|
+
}
|
104
|
+
|
99
105
|
extern "C"
|
100
106
|
void Init_ext()
|
101
107
|
{
|
@@ -126,7 +132,7 @@ void Init_ext()
|
|
126
132
|
.define_method(
|
127
133
|
"_add_doc",
|
128
134
|
*[](tomoto::ILDAModel& self, std::vector<std::string> words) {
|
129
|
-
self.addDoc(words);
|
135
|
+
self.addDoc(buildDoc(words));
|
130
136
|
})
|
131
137
|
.define_method(
|
132
138
|
"alpha",
|
@@ -379,8 +385,10 @@ void Init_ext()
|
|
379
385
|
})
|
380
386
|
.define_method(
|
381
387
|
"_add_doc",
|
382
|
-
*[](tomoto::IDMRModel& self, std::vector<std::string> words, std::
|
383
|
-
|
388
|
+
*[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
|
389
|
+
auto doc = buildDoc(words);
|
390
|
+
doc.misc["metadata"] = metadata;
|
391
|
+
self.addDoc(doc);
|
384
392
|
})
|
385
393
|
.define_method(
|
386
394
|
"alpha_epsilon",
|
@@ -433,8 +441,10 @@ void Init_ext()
|
|
433
441
|
})
|
434
442
|
.define_method(
|
435
443
|
"_add_doc",
|
436
|
-
*[](tomoto::IDTModel& self, std::vector<std::string> words,
|
437
|
-
|
444
|
+
*[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
|
445
|
+
auto doc = buildDoc(words);
|
446
|
+
doc.misc["timepoint"] = timepoint;
|
447
|
+
self.addDoc(doc);
|
438
448
|
})
|
439
449
|
.define_method(
|
440
450
|
"lr_a",
|
@@ -489,6 +499,13 @@ void Init_ext()
|
|
489
499
|
}
|
490
500
|
return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
|
491
501
|
})
|
502
|
+
.define_method(
|
503
|
+
"_add_doc",
|
504
|
+
*[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
|
505
|
+
auto doc = buildDoc(words);
|
506
|
+
doc.misc["metadata"] = metadata;
|
507
|
+
self.addDoc(doc);
|
508
|
+
})
|
492
509
|
.define_method(
|
493
510
|
"degrees",
|
494
511
|
*[](tomoto::IGDMRModel& self) {
|
@@ -643,7 +660,9 @@ void Init_ext()
|
|
643
660
|
.define_method(
|
644
661
|
"_add_doc",
|
645
662
|
*[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
|
646
|
-
|
663
|
+
auto doc = buildDoc(words);
|
664
|
+
doc.misc["delimiter"] = delimiter;
|
665
|
+
self.addDoc(doc);
|
647
666
|
})
|
648
667
|
.define_method(
|
649
668
|
"alpha_g",
|
@@ -708,7 +727,9 @@ void Init_ext()
|
|
708
727
|
.define_method(
|
709
728
|
"_add_doc",
|
710
729
|
*[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
|
711
|
-
|
730
|
+
auto doc = buildDoc(words);
|
731
|
+
doc.misc["labels"] = labels;
|
732
|
+
self.addDoc(doc);
|
712
733
|
})
|
713
734
|
.define_method(
|
714
735
|
"topics_per_label",
|
@@ -728,7 +749,9 @@ void Init_ext()
|
|
728
749
|
.define_method(
|
729
750
|
"_add_doc",
|
730
751
|
*[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
|
731
|
-
|
752
|
+
auto doc = buildDoc(words);
|
753
|
+
doc.misc["labels"] = labels;
|
754
|
+
self.addDoc(doc);
|
732
755
|
})
|
733
756
|
.define_method(
|
734
757
|
"latent_topics",
|
@@ -753,7 +776,9 @@ void Init_ext()
|
|
753
776
|
.define_method(
|
754
777
|
"_add_doc",
|
755
778
|
*[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
|
756
|
-
|
779
|
+
auto doc = buildDoc(words);
|
780
|
+
doc.misc["y"] = y;
|
781
|
+
self.addDoc(doc);
|
757
782
|
})
|
758
783
|
.define_method(
|
759
784
|
"f",
|
data/ext/tomoto/extconf.rb
CHANGED
@@ -23,9 +23,10 @@ ext = File.expand_path(".", __dir__)
|
|
23
23
|
tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
|
24
24
|
eigen = File.expand_path("../../vendor/eigen", __dir__)
|
25
25
|
eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
|
26
|
+
variant = File.expand_path("../../vendor/variant/include", __dir__)
|
26
27
|
|
27
28
|
$srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
|
28
|
-
$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand}"
|
29
|
+
$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
|
29
30
|
$VPATH << tomoto
|
30
31
|
|
31
32
|
create_makefile("tomoto/ext")
|
data/lib/tomoto/dmr.rb
CHANGED
data/lib/tomoto/gdmr.rb
CHANGED
data/lib/tomoto/version.rb
CHANGED
data/vendor/tomotopy/LICENSE
CHANGED
@@ -35,7 +35,7 @@ tomotopy 란?
|
|
35
35
|
|
36
36
|
더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
|
37
37
|
|
38
|
-
tomotopy의 가장 최신버전은 0.
|
38
|
+
tomotopy의 가장 최신버전은 0.10.0 입니다.
|
39
39
|
|
40
40
|
시작하기
|
41
41
|
---------------
|
@@ -255,6 +255,17 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
255
255
|
|
256
256
|
역사
|
257
257
|
-------
|
258
|
+
* 0.10.0 (2020-12-19)
|
259
|
+
* `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
|
260
|
+
* `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
|
261
|
+
* `tomotopy.utils.Corpus.extract_ngrams`와 `tomotopy.utils.Corpus.concat_ngrams`이 추가되었습니다. PMI를 이용해 코퍼스 내에서 자동으로 n-gram collocation을 발견해 한 단어로 합치는 기능을 수행합니다.
|
262
|
+
* `tomotopy.LDAModel.add_corpus`가 추가되었고, `tomotopy.LDAModel.infer`가 Raw 코퍼스를 입력으로 받을 수 있게 되었습니다.
|
263
|
+
* `tomotopy.coherence` 모듈이 추가되었습니다. 생성된 토픽 모델의 coherence를 계산하는 기능을 담당합니다.
|
264
|
+
* `tomotopy.label.FoRelevance`에 window_size 파라미터가 추가되었습니다.
|
265
|
+
* `tomotopy.HDPModel` 학습 시 종종 NaN이 발생하는 문제를 해결했습니다.
|
266
|
+
* 이제 Python3.9를 지원합니다.
|
267
|
+
* py-cpuinfo에 대한 의존성이 제거되고, 모듈 로딩속도가 개선되었습니다.
|
268
|
+
|
258
269
|
* 0.9.1 (2020-08-08)
|
259
270
|
* 0.9.0 버전의 메모리 누수 문제가 해결되었습니다.
|
260
271
|
* `tomotopy.CTModel.summary()`가 잘못된 결과를 출력하는 문제가 해결되었습니다.
|
@@ -277,7 +288,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
277
288
|
|
278
289
|
* 0.8.2 (2020-07-14)
|
279
290
|
* `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다.
|
280
|
-
* `seed`가
|
291
|
+
* `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
|
281
292
|
|
282
293
|
* 0.8.1 (2020-06-08)
|
283
294
|
* `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다.
|
@@ -302,7 +313,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
302
313
|
* 0.7.0 (2020-04-18)
|
303
314
|
* `tomotopy.DTModel`이 추가되었습니다.
|
304
315
|
* `tomotopy.utils.Corpus.save`가 제대로 작동하지 않는 버그가 수정되었습니다.
|
305
|
-
* `tomotopy.
|
316
|
+
* `tomotopy.Document.get_count_vector`가 추가되었습니다.
|
306
317
|
* 리눅스용 바이너리가 manylinux2010 버전으로 변경되었고 이에 따른 최적화가 진행되었습니다.
|
307
318
|
|
308
319
|
* 0.6.2 (2020-03-28)
|
@@ -373,3 +384,21 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
373
384
|
|
374
385
|
* 0.1.0 (2019-05-12)
|
375
386
|
* **tomotopy**의 최초 버전
|
387
|
+
|
388
|
+
다른 언어용 바인딩
|
389
|
+
-------------------
|
390
|
+
|
391
|
+
* Ruby: https://github.com/ankane/tomoto
|
392
|
+
|
393
|
+
포함된 라이브러리들의 라이센스
|
394
|
+
-------------------------------
|
395
|
+
* Eigen:
|
396
|
+
This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
|
397
|
+
A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
|
398
|
+
The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
|
399
|
+
|
400
|
+
* EigenRand: `MIT License
|
401
|
+
<licenses_bundled/EigenRand>`_
|
402
|
+
|
403
|
+
* Mapbox Variant: `BSD License
|
404
|
+
<licenses_bundled/MapboxVariant>`_
|
data/vendor/tomotopy/README.rst
CHANGED
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
|
|
36
36
|
|
37
37
|
Please visit https://bab2min.github.io/tomotopy to see more information.
|
38
38
|
|
39
|
-
The most recent version of tomotopy is 0.
|
39
|
+
The most recent version of tomotopy is 0.10.0.
|
40
40
|
|
41
41
|
Getting Started
|
42
42
|
---------------
|
@@ -261,6 +261,17 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
|
|
261
261
|
|
262
262
|
History
|
263
263
|
-------
|
264
|
+
* 0.10.0 (2020-12-19)
|
265
|
+
* The interfaces of `tomotopy.utils.Corpus` and `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
|
266
|
+
* __getitem__ of `tomotopy.utils.Corpus` was improved. In addition to indexing by int, multi-indexing by Iterable[int] or slicing is supported, as is indexing by uid.
|
267
|
+
* New methods `tomotopy.utils.Corpus.extract_ngrams` and `tomotopy.utils.Corpus.concat_ngrams` were added. They extract n-gram collocations using PMI and concatenate them into single words.
|
268
|
+
* A new method `tomotopy.LDAModel.add_corpus` was added, and `tomotopy.LDAModel.infer` can receive corpus as input.
|
269
|
+
* A new module `tomotopy.coherence` was added. It provides the way to calculate coherence of the model.
|
270
|
+
* A parameter `window_size` was added to `tomotopy.label.FoRelevance`.
|
271
|
+
* An issue was fixed where NaN often occurs when training `tomotopy.HDPModel`.
|
272
|
+
* Now Python3.9 is supported.
|
273
|
+
* The dependency on py-cpuinfo was removed and the initialization of the module was improved.
|
274
|
+
|
264
275
|
* 0.9.1 (2020-08-08)
|
265
276
|
* Memory leaks in version 0.9.0 were fixed.
|
266
277
|
* `tomotopy.CTModel.summary()` was fixed.
|
@@ -380,3 +391,21 @@ History
|
|
380
391
|
|
381
392
|
* 0.1.0 (2019-05-12)
|
382
393
|
* First version of **tomotopy**
|
394
|
+
|
395
|
+
Bindings for Other Languages
|
396
|
+
------------------------------
|
397
|
+
|
398
|
+
* Ruby: https://github.com/ankane/tomoto
|
399
|
+
|
400
|
+
Bundled Libraries and Their License
|
401
|
+
------------------------------------
|
402
|
+
* Eigen:
|
403
|
+
This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
|
404
|
+
A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
|
405
|
+
The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
|
406
|
+
|
407
|
+
* EigenRand: `MIT License
|
408
|
+
<licenses_bundled/EigenRand>`_
|
409
|
+
|
410
|
+
* Mapbox Variant: `BSD License
|
411
|
+
<licenses_bundled/MapboxVariant>`_
|
@@ -5,161 +5,74 @@
|
|
5
5
|
|
6
6
|
using namespace tomoto::label;
|
7
7
|
|
8
|
-
|
8
|
+
class DocWrapper
|
9
9
|
{
|
10
|
-
|
11
|
-
|
10
|
+
const tomoto::DocumentBase* doc;
|
11
|
+
public:
|
12
|
+
DocWrapper(const tomoto::DocumentBase* _doc = nullptr)
|
13
|
+
: doc{ _doc }
|
12
14
|
{
|
13
|
-
|
14
|
-
{
|
15
|
-
return hash<tomoto::Vid>{}(k.first) ^ hash<tomoto::Vid>{}(k.second);
|
16
|
-
}
|
17
|
-
};
|
18
|
-
}
|
19
|
-
|
20
|
-
std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel * tm) const
|
21
|
-
{
|
22
|
-
auto& vocabFreqs = tm->getVocabCf();
|
23
|
-
auto& vocabDf = tm->getVocabDf();
|
24
|
-
|
25
|
-
// counting unigrams & bigrams
|
26
|
-
std::unordered_map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
|
15
|
+
}
|
27
16
|
|
28
|
-
|
17
|
+
size_t size() const
|
29
18
|
{
|
30
|
-
|
31
|
-
auto doc = tm->getDoc(i);
|
32
|
-
Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
|
33
|
-
for (size_t j = 1; j < doc->words.size(); ++j)
|
34
|
-
{
|
35
|
-
Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
|
36
|
-
if (vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
|
37
|
-
{
|
38
|
-
if (vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
|
39
|
-
{
|
40
|
-
bigramCnt[std::make_pair(prevWord, curWord)]++;
|
41
|
-
uniqBigram.emplace(prevWord, curWord);
|
42
|
-
}
|
43
|
-
}
|
44
|
-
prevWord = curWord;
|
45
|
-
}
|
46
|
-
|
47
|
-
for (auto& p : uniqBigram) bigramDf[p]++;
|
19
|
+
return doc->words.size();
|
48
20
|
}
|
49
21
|
|
50
|
-
|
51
|
-
// counting ngrams
|
52
|
-
std::vector<TrieEx<Vid, size_t>> trieNodes;
|
53
|
-
|
54
|
-
if (maxLabelLen > 2)
|
22
|
+
tomoto::Vid operator[](size_t idx) const
|
55
23
|
{
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
if (p.second >= candMinCnt) validPair.emplace(p.first);
|
60
|
-
}
|
61
|
-
|
62
|
-
trieNodes.resize(1);
|
63
|
-
auto allocNode = [&]() { return trieNodes.emplace_back(), &trieNodes.back(); };
|
64
|
-
|
65
|
-
for (size_t i = 0; i < tm->getNumDocs(); ++i)
|
66
|
-
{
|
67
|
-
auto doc = tm->getDoc(i);
|
68
|
-
if (trieNodes.capacity() < trieNodes.size() + doc->words.size() * maxLabelLen)
|
69
|
-
{
|
70
|
-
trieNodes.reserve(std::max(trieNodes.size() + doc->words.size() * maxLabelLen, trieNodes.capacity() * 2));
|
71
|
-
}
|
72
|
-
|
73
|
-
Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
|
74
|
-
size_t labelLen = 0;
|
75
|
-
auto node = &trieNodes[0];
|
76
|
-
if (vocabFreqs[prevWord] >= candMinCnt)
|
77
|
-
{
|
78
|
-
node = trieNodes[0].makeNext(prevWord, allocNode);
|
79
|
-
node->val++;
|
80
|
-
labelLen = 1;
|
81
|
-
}
|
82
|
-
|
83
|
-
for (size_t j = 1; j < doc->words.size(); ++j)
|
84
|
-
{
|
85
|
-
Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
|
24
|
+
return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
|
25
|
+
}
|
26
|
+
};
|
86
27
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
node = node->getFail();
|
97
|
-
labelLen--;
|
98
|
-
}
|
28
|
+
class DocIterator
|
29
|
+
{
|
30
|
+
const tomoto::ITopicModel* tm;
|
31
|
+
size_t idx;
|
32
|
+
public:
|
33
|
+
DocIterator(const tomoto::ITopicModel* _tm = nullptr, size_t _idx = 0)
|
34
|
+
: tm{ _tm }, idx{ _idx }
|
35
|
+
{
|
36
|
+
}
|
99
37
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
node = nnode;
|
104
|
-
do
|
105
|
-
{
|
106
|
-
nnode->val++;
|
107
|
-
} while (nnode = nnode->getFail());
|
108
|
-
labelLen++;
|
109
|
-
}
|
110
|
-
else
|
111
|
-
{
|
112
|
-
node = trieNodes[0].makeNext(curWord, allocNode);
|
113
|
-
node->val++;
|
114
|
-
labelLen = 1;
|
115
|
-
}
|
116
|
-
}
|
117
|
-
prevWord = curWord;
|
118
|
-
}
|
119
|
-
}
|
38
|
+
DocWrapper operator*() const
|
39
|
+
{
|
40
|
+
return { tm->getDoc(idx) };
|
120
41
|
}
|
121
42
|
|
122
|
-
|
123
|
-
std::vector<Candidate> candidates;
|
124
|
-
for (auto& p : bigramCnt)
|
43
|
+
DocIterator& operator++()
|
125
44
|
{
|
126
|
-
|
127
|
-
|
128
|
-
if (bigramDf[bigram] < candMinDf) continue;
|
129
|
-
auto pmi = std::log(p.second * (float)tm->getN()
|
130
|
-
/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
|
131
|
-
if (pmi <= 0) continue;
|
132
|
-
candidates.emplace_back(pmi, bigram.first, bigram.second);
|
45
|
+
++idx;
|
46
|
+
return *this;
|
133
47
|
}
|
134
48
|
|
135
|
-
|
49
|
+
bool operator==(const DocIterator& o) const
|
136
50
|
{
|
137
|
-
|
138
|
-
trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
|
139
|
-
{
|
140
|
-
if (rkeys.size() <= 2 || node->val < candMinCnt) return;
|
141
|
-
float n = (float)tm->getN();
|
142
|
-
auto pmi = node->val / n;
|
143
|
-
for (auto k : rkeys)
|
144
|
-
{
|
145
|
-
pmi *= n / vocabFreqs[k];
|
146
|
-
}
|
147
|
-
pmi = std::log(pmi);
|
148
|
-
candidates.emplace_back(pmi, rkeys);
|
149
|
-
}, rkeys);
|
51
|
+
return tm == o.tm && idx == o.idx;
|
150
52
|
}
|
151
53
|
|
152
|
-
|
54
|
+
bool operator!=(const DocIterator& o) const
|
153
55
|
{
|
154
|
-
return
|
155
|
-
}
|
156
|
-
|
56
|
+
return tm != o.tm || idx != o.idx;
|
57
|
+
}
|
58
|
+
};
|
157
59
|
|
158
|
-
|
60
|
+
std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) const
|
61
|
+
{
|
62
|
+
auto& vocabFreqs = tm->getVocabCf();
|
63
|
+
auto& vocabDf = tm->getVocabDf();
|
64
|
+
auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
|
65
|
+
vocabFreqs, vocabDf,
|
66
|
+
candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
|
67
|
+
);
|
68
|
+
if (minLabelLen <= 1)
|
159
69
|
{
|
160
|
-
|
161
|
-
|
162
|
-
|
70
|
+
for (size_t i = 0; i < vocabDf.size(); ++i)
|
71
|
+
{
|
72
|
+
if (vocabFreqs[i] < candMinCnt) continue;
|
73
|
+
if (vocabDf[i] < candMinDf) continue;
|
74
|
+
candidates.emplace_back(0.f, i);
|
75
|
+
}
|
163
76
|
}
|
164
77
|
return candidates;
|
165
78
|
}
|
@@ -172,8 +85,7 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
|
|
172
85
|
auto node = root;
|
173
86
|
for (size_t j = 0; j < doc->words.size(); ++j)
|
174
87
|
{
|
175
|
-
|
176
|
-
tomoto::Vid curWord = doc->words[t];
|
88
|
+
tomoto::Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
|
177
89
|
if (curWord < tm->getV()) bdf[curWord] = 1;
|
178
90
|
auto nnode = node->getNext(curWord);
|
179
91
|
while (!nnode)
|
@@ -191,16 +103,15 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
|
|
191
103
|
// the matched candidate is found
|
192
104
|
if (nnode->val && nnode->val != (size_t)-1)
|
193
105
|
{
|
194
|
-
auto& c = candidates[nnode->val - 1];
|
195
106
|
tomoto::OptionalLock<_lock> lock{ mtx[(nnode->val - 1) % (pool ? pool->getNumWorkers() : 1)] };
|
107
|
+
auto& c = candidates[nnode->val - 1];
|
196
108
|
if (c.name.empty() && !doc->origWordPos.empty())
|
197
109
|
{
|
198
110
|
size_t start = doc->origWordPos[j + 1 - c.w.size()];
|
199
111
|
size_t end = doc->origWordPos[j] + doc->origWordLen[j];
|
200
112
|
c.names[doc->rawStr.substr(start, end - start)]++;
|
201
113
|
}
|
202
|
-
|
203
|
-
if (docIds.empty() || docIds.back() != docId) docIds.emplace_back(docId);
|
114
|
+
c.docIds.emplace(docId);
|
204
115
|
}
|
205
116
|
} while (nnode = nnode->getFail());
|
206
117
|
}
|
@@ -268,7 +179,22 @@ void FoRelevance::estimateContexts()
|
|
268
179
|
wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
|
269
180
|
}
|
270
181
|
|
271
|
-
|
182
|
+
size_t totDocCnt = 0;
|
183
|
+
if (windowSize == (size_t)-1)
|
184
|
+
{
|
185
|
+
totDocCnt = tm->getNumDocs();
|
186
|
+
}
|
187
|
+
else
|
188
|
+
{
|
189
|
+
for (size_t i = 0; i < tm->getNumDocs(); ++i)
|
190
|
+
{
|
191
|
+
size_t s = tm->getDoc(i)->words.size();
|
192
|
+
if (s <= windowSize) totDocCnt += 1;
|
193
|
+
else totDocCnt += s - windowSize + 1;
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
auto calcScores = [&](CandidateEx& c, size_t windowSize)
|
272
198
|
{
|
273
199
|
if (c.docIds.size() < candMinDf) return;
|
274
200
|
if (c.name.empty() && !c.names.empty())
|
@@ -284,20 +210,80 @@ void FoRelevance::estimateContexts()
|
|
284
210
|
}
|
285
211
|
}
|
286
212
|
|
213
|
+
size_t docCnt = 0;
|
287
214
|
Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
|
288
215
|
for (auto& docId : c.docIds)
|
289
216
|
{
|
290
217
|
thread_local Eigen::VectorXi bdf(this->tm->getV());
|
291
218
|
bdf.setZero();
|
292
219
|
auto doc = this->tm->getDoc(docId);
|
293
|
-
|
220
|
+
if (doc->words.size() <= windowSize)
|
294
221
|
{
|
295
|
-
|
222
|
+
for (size_t i = 0; i < doc->words.size(); ++i)
|
223
|
+
{
|
224
|
+
if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
|
225
|
+
}
|
226
|
+
docCnt++;
|
227
|
+
wcPMI += bdf.template cast<Float>();
|
228
|
+
}
|
229
|
+
else
|
230
|
+
{
|
231
|
+
auto wit = c.w.begin();
|
232
|
+
std::deque<size_t> wpos;
|
233
|
+
for (size_t i = 0; i < windowSize; ++i)
|
234
|
+
{
|
235
|
+
Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
|
236
|
+
if (word < this->tm->getV()) bdf[word]++;
|
237
|
+
|
238
|
+
if (word == *wit)
|
239
|
+
{
|
240
|
+
if (++wit == c.w.end())
|
241
|
+
{
|
242
|
+
wpos.emplace_back(i + 1);
|
243
|
+
wit = c.w.begin();
|
244
|
+
}
|
245
|
+
}
|
246
|
+
else if (word == c.w[0]) wit = c.w.begin() + 1;
|
247
|
+
else wit = c.w.begin();
|
248
|
+
}
|
249
|
+
if (!wpos.empty())
|
250
|
+
{
|
251
|
+
docCnt++;
|
252
|
+
wcPMI += Eigen::bool2float(bdf.array()).matrix();
|
253
|
+
}
|
254
|
+
|
255
|
+
for (size_t i = windowSize; i < doc->words.size(); ++i)
|
256
|
+
{
|
257
|
+
Vid oword = doc->words[doc->wOrder.empty() ? (i - windowSize) : doc->wOrder[i - windowSize]];
|
258
|
+
Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
|
259
|
+
if (oword < this->tm->getV()) bdf[oword]--;
|
260
|
+
if (word < this->tm->getV()) bdf[word]++;
|
261
|
+
if (!wpos.empty() && wpos.front() - c.w.size() <= i - windowSize)
|
262
|
+
{
|
263
|
+
wpos.pop_front();
|
264
|
+
}
|
265
|
+
|
266
|
+
if (word == *wit)
|
267
|
+
{
|
268
|
+
if (++wit == c.w.end())
|
269
|
+
{
|
270
|
+
wpos.emplace_back(i + 1);
|
271
|
+
wit = c.w.begin();
|
272
|
+
}
|
273
|
+
}
|
274
|
+
else if (word == c.w[0]) wit = c.w.begin() + 1;
|
275
|
+
else wit = c.w.begin();
|
276
|
+
|
277
|
+
if (!wpos.empty())
|
278
|
+
{
|
279
|
+
docCnt++;
|
280
|
+
wcPMI += Eigen::bool2float(bdf.array()).matrix();
|
281
|
+
}
|
282
|
+
}
|
296
283
|
}
|
297
|
-
wcPMI += bdf.cast<Float>();
|
298
284
|
}
|
299
285
|
c.scores = wordTopicDist.transpose() *
|
300
|
-
((wcPMI.array() + smoothing) *
|
286
|
+
((wcPMI.array() + smoothing) * totDocCnt / docCnt / df.cast<Float>()).log().matrix();
|
301
287
|
};
|
302
288
|
|
303
289
|
if (pool)
|
@@ -311,7 +297,7 @@ void FoRelevance::estimateContexts()
|
|
311
297
|
{
|
312
298
|
for (size_t i = g; i < candidates.size(); i += groups)
|
313
299
|
{
|
314
|
-
calcScores(candidates[i]);
|
300
|
+
calcScores(candidates[i], windowSize);
|
315
301
|
}
|
316
302
|
}, g));
|
317
303
|
}
|
@@ -321,7 +307,7 @@ void FoRelevance::estimateContexts()
|
|
321
307
|
{
|
322
308
|
for (auto& c : candidates)
|
323
309
|
{
|
324
|
-
calcScores(c);
|
310
|
+
calcScores(c, windowSize);
|
325
311
|
}
|
326
312
|
}
|
327
313
|
|