tomoto 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/ext/tomoto/ext.cpp +34 -9
- data/ext/tomoto/extconf.rb +2 -1
- data/lib/tomoto/dmr.rb +1 -1
- data/lib/tomoto/gdmr.rb +1 -1
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/LICENSE +1 -1
- data/vendor/tomotopy/README.kr.rst +32 -3
- data/vendor/tomotopy/README.rst +30 -1
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
- data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
- data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
- data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
- data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
- data/vendor/tomotopy/src/Utils/math.h +1 -1
- data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
- data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- data/vendor/variant/include/mapbox/optional.hpp +74 -0
- data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
- data/vendor/variant/include/mapbox/variant.hpp +974 -0
- data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
- metadata +15 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
|
|
4
|
+
data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
|
|
7
|
+
data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
:tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
|
|
4
4
|
|
|
5
|
-
[](https://github.com/ankane/tomoto/actions)
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
@@ -19,7 +19,7 @@ It can take 10-20 minutes to compile the extension.
|
|
|
19
19
|
Train a model
|
|
20
20
|
|
|
21
21
|
```ruby
|
|
22
|
-
model = Tomoto::LDA.new(k:
|
|
22
|
+
model = Tomoto::LDA.new(k: 2)
|
|
23
23
|
model.add_doc("text from document one")
|
|
24
24
|
model.add_doc("text from document two")
|
|
25
25
|
model.add_doc("text from document three")
|
|
@@ -98,7 +98,7 @@ If a method or option you need isn’t supported, feel free to open an issue.
|
|
|
98
98
|
## Examples
|
|
99
99
|
|
|
100
100
|
- [LDA](examples/lda_basic.rb)
|
|
101
|
-
- [HDP](examples/
|
|
101
|
+
- [HDP](examples/hdp_basic.rb)
|
|
102
102
|
|
|
103
103
|
## Tokenization
|
|
104
104
|
|
data/ext/tomoto/ext.cpp
CHANGED
|
@@ -96,6 +96,12 @@ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
|
|
|
96
96
|
return res;
|
|
97
97
|
}
|
|
98
98
|
|
|
99
|
+
tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
|
|
100
|
+
tomoto::RawDoc doc;
|
|
101
|
+
doc.rawWords = words;
|
|
102
|
+
return doc;
|
|
103
|
+
}
|
|
104
|
+
|
|
99
105
|
extern "C"
|
|
100
106
|
void Init_ext()
|
|
101
107
|
{
|
|
@@ -126,7 +132,7 @@ void Init_ext()
|
|
|
126
132
|
.define_method(
|
|
127
133
|
"_add_doc",
|
|
128
134
|
*[](tomoto::ILDAModel& self, std::vector<std::string> words) {
|
|
129
|
-
self.addDoc(words);
|
|
135
|
+
self.addDoc(buildDoc(words));
|
|
130
136
|
})
|
|
131
137
|
.define_method(
|
|
132
138
|
"alpha",
|
|
@@ -379,8 +385,10 @@ void Init_ext()
|
|
|
379
385
|
})
|
|
380
386
|
.define_method(
|
|
381
387
|
"_add_doc",
|
|
382
|
-
*[](tomoto::IDMRModel& self, std::vector<std::string> words, std::
|
|
383
|
-
|
|
388
|
+
*[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
|
|
389
|
+
auto doc = buildDoc(words);
|
|
390
|
+
doc.misc["metadata"] = metadata;
|
|
391
|
+
self.addDoc(doc);
|
|
384
392
|
})
|
|
385
393
|
.define_method(
|
|
386
394
|
"alpha_epsilon",
|
|
@@ -433,8 +441,10 @@ void Init_ext()
|
|
|
433
441
|
})
|
|
434
442
|
.define_method(
|
|
435
443
|
"_add_doc",
|
|
436
|
-
*[](tomoto::IDTModel& self, std::vector<std::string> words,
|
|
437
|
-
|
|
444
|
+
*[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
|
|
445
|
+
auto doc = buildDoc(words);
|
|
446
|
+
doc.misc["timepoint"] = timepoint;
|
|
447
|
+
self.addDoc(doc);
|
|
438
448
|
})
|
|
439
449
|
.define_method(
|
|
440
450
|
"lr_a",
|
|
@@ -489,6 +499,13 @@ void Init_ext()
|
|
|
489
499
|
}
|
|
490
500
|
return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
|
|
491
501
|
})
|
|
502
|
+
.define_method(
|
|
503
|
+
"_add_doc",
|
|
504
|
+
*[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
|
|
505
|
+
auto doc = buildDoc(words);
|
|
506
|
+
doc.misc["metadata"] = metadata;
|
|
507
|
+
self.addDoc(doc);
|
|
508
|
+
})
|
|
492
509
|
.define_method(
|
|
493
510
|
"degrees",
|
|
494
511
|
*[](tomoto::IGDMRModel& self) {
|
|
@@ -643,7 +660,9 @@ void Init_ext()
|
|
|
643
660
|
.define_method(
|
|
644
661
|
"_add_doc",
|
|
645
662
|
*[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
|
|
646
|
-
|
|
663
|
+
auto doc = buildDoc(words);
|
|
664
|
+
doc.misc["delimiter"] = delimiter;
|
|
665
|
+
self.addDoc(doc);
|
|
647
666
|
})
|
|
648
667
|
.define_method(
|
|
649
668
|
"alpha_g",
|
|
@@ -708,7 +727,9 @@ void Init_ext()
|
|
|
708
727
|
.define_method(
|
|
709
728
|
"_add_doc",
|
|
710
729
|
*[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
|
|
711
|
-
|
|
730
|
+
auto doc = buildDoc(words);
|
|
731
|
+
doc.misc["labels"] = labels;
|
|
732
|
+
self.addDoc(doc);
|
|
712
733
|
})
|
|
713
734
|
.define_method(
|
|
714
735
|
"topics_per_label",
|
|
@@ -728,7 +749,9 @@ void Init_ext()
|
|
|
728
749
|
.define_method(
|
|
729
750
|
"_add_doc",
|
|
730
751
|
*[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
|
|
731
|
-
|
|
752
|
+
auto doc = buildDoc(words);
|
|
753
|
+
doc.misc["labels"] = labels;
|
|
754
|
+
self.addDoc(doc);
|
|
732
755
|
})
|
|
733
756
|
.define_method(
|
|
734
757
|
"latent_topics",
|
|
@@ -753,7 +776,9 @@ void Init_ext()
|
|
|
753
776
|
.define_method(
|
|
754
777
|
"_add_doc",
|
|
755
778
|
*[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
|
|
756
|
-
|
|
779
|
+
auto doc = buildDoc(words);
|
|
780
|
+
doc.misc["y"] = y;
|
|
781
|
+
self.addDoc(doc);
|
|
757
782
|
})
|
|
758
783
|
.define_method(
|
|
759
784
|
"f",
|
data/ext/tomoto/extconf.rb
CHANGED
|
@@ -23,9 +23,10 @@ ext = File.expand_path(".", __dir__)
|
|
|
23
23
|
tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
|
|
24
24
|
eigen = File.expand_path("../../vendor/eigen", __dir__)
|
|
25
25
|
eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
|
|
26
|
+
variant = File.expand_path("../../vendor/variant/include", __dir__)
|
|
26
27
|
|
|
27
28
|
$srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
|
|
28
|
-
$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand}"
|
|
29
|
+
$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
|
|
29
30
|
$VPATH << tomoto
|
|
30
31
|
|
|
31
32
|
create_makefile("tomoto/ext")
|
data/lib/tomoto/dmr.rb
CHANGED
data/lib/tomoto/gdmr.rb
CHANGED
data/lib/tomoto/version.rb
CHANGED
data/vendor/tomotopy/LICENSE
CHANGED
|
@@ -35,7 +35,7 @@ tomotopy 란?
|
|
|
35
35
|
|
|
36
36
|
더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
|
|
37
37
|
|
|
38
|
-
tomotopy의 가장 최신버전은 0.
|
|
38
|
+
tomotopy의 가장 최신버전은 0.10.0 입니다.
|
|
39
39
|
|
|
40
40
|
시작하기
|
|
41
41
|
---------------
|
|
@@ -255,6 +255,17 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
|
255
255
|
|
|
256
256
|
역사
|
|
257
257
|
-------
|
|
258
|
+
* 0.10.0 (2020-12-19)
|
|
259
|
+
* `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
|
|
260
|
+
* `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
|
|
261
|
+
* `tomotopy.utils.Corpus.extract_ngrams`와 `tomotopy.utils.Corpus.concat_ngrams`이 추가되었습니다. PMI를 이용해 코퍼스 내에서 자동으로 n-gram collocation을 발견해 한 단어로 합치는 기능을 수행합니다.
|
|
262
|
+
* `tomotopy.LDAModel.add_corpus`가 추가되었고, `tomotopy.LDAModel.infer`가 Raw 코퍼스를 입력으로 받을 수 있게 되었습니다.
|
|
263
|
+
* `tomotopy.coherence` 모듈이 추가되었습니다. 생성된 토픽 모델의 coherence를 계산하는 기능을 담당합니다.
|
|
264
|
+
* `tomotopy.label.FoRelevance`에 window_size 파라미터가 추가되었습니다.
|
|
265
|
+
* `tomotopy.HDPModel` 학습 시 종종 NaN이 발생하는 문제를 해결했습니다.
|
|
266
|
+
* 이제 Python3.9를 지원합니다.
|
|
267
|
+
* py-cpuinfo에 대한 의존성이 제거되고, 모듈 로딩속도가 개선되었습니다.
|
|
268
|
+
|
|
258
269
|
* 0.9.1 (2020-08-08)
|
|
259
270
|
* 0.9.0 버전의 메모리 누수 문제가 해결되었습니다.
|
|
260
271
|
* `tomotopy.CTModel.summary()`가 잘못된 결과를 출력하는 문제가 해결되었습니다.
|
|
@@ -277,7 +288,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
|
277
288
|
|
|
278
289
|
* 0.8.2 (2020-07-14)
|
|
279
290
|
* `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다.
|
|
280
|
-
* `seed`가
|
|
291
|
+
* `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
|
|
281
292
|
|
|
282
293
|
* 0.8.1 (2020-06-08)
|
|
283
294
|
* `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다.
|
|
@@ -302,7 +313,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
|
302
313
|
* 0.7.0 (2020-04-18)
|
|
303
314
|
* `tomotopy.DTModel`이 추가되었습니다.
|
|
304
315
|
* `tomotopy.utils.Corpus.save`가 제대로 작동하지 않는 버그가 수정되었습니다.
|
|
305
|
-
* `tomotopy.
|
|
316
|
+
* `tomotopy.Document.get_count_vector`가 추가되었습니다.
|
|
306
317
|
* 리눅스용 바이너리가 manylinux2010 버전으로 변경되었고 이에 따른 최적화가 진행되었습니다.
|
|
307
318
|
|
|
308
319
|
* 0.6.2 (2020-03-28)
|
|
@@ -373,3 +384,21 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
|
373
384
|
|
|
374
385
|
* 0.1.0 (2019-05-12)
|
|
375
386
|
* **tomotopy**의 최초 버전
|
|
387
|
+
|
|
388
|
+
다른 언어용 바인딩
|
|
389
|
+
-------------------
|
|
390
|
+
|
|
391
|
+
* Ruby: https://github.com/ankane/tomoto
|
|
392
|
+
|
|
393
|
+
포함된 라이브러리들의 라이센스
|
|
394
|
+
-------------------------------
|
|
395
|
+
* Eigen:
|
|
396
|
+
This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
|
|
397
|
+
A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
|
|
398
|
+
The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
|
|
399
|
+
|
|
400
|
+
* EigenRand: `MIT License
|
|
401
|
+
<licenses_bundled/EigenRand>`_
|
|
402
|
+
|
|
403
|
+
* Mapbox Variant: `BSD License
|
|
404
|
+
<licenses_bundled/MapboxVariant>`_
|
data/vendor/tomotopy/README.rst
CHANGED
|
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
|
|
|
36
36
|
|
|
37
37
|
Please visit https://bab2min.github.io/tomotopy to see more information.
|
|
38
38
|
|
|
39
|
-
The most recent version of tomotopy is 0.
|
|
39
|
+
The most recent version of tomotopy is 0.10.0.
|
|
40
40
|
|
|
41
41
|
Getting Started
|
|
42
42
|
---------------
|
|
@@ -261,6 +261,17 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
|
|
|
261
261
|
|
|
262
262
|
History
|
|
263
263
|
-------
|
|
264
|
+
* 0.10.0 (2020-12-19)
|
|
265
|
+
* The interfaces of `tomotopy.utils.Corpus` and `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
|
|
266
|
+
* __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also indexing by Iterable[int] and slicing are supported. Indexing by uid is also supported.
|
|
267
|
+
* New methods `tomotopy.utils.Corpus.extract_ngrams` and `tomotopy.utils.Corpus.concat_ngrams` were added. They extract n-gram collocations using PMI and concatenate them into single words.
|
|
268
|
+
* A new method `tomotopy.LDAModel.add_corpus` was added, and `tomotopy.LDAModel.infer` can receive corpus as input.
|
|
269
|
+
* A new module `tomotopy.coherence` was added. It provides a way to calculate the coherence of the model.
|
|
270
|
+
* A parameter `window_size` was added to `tomotopy.label.FoRelevance`.
|
|
271
|
+
* An issue was fixed where NaN often occurs when training `tomotopy.HDPModel`.
|
|
272
|
+
* Now Python3.9 is supported.
|
|
273
|
+
* The dependency on py-cpuinfo was removed and the loading speed of the module was improved.
|
|
274
|
+
|
|
264
275
|
* 0.9.1 (2020-08-08)
|
|
265
276
|
* Memory leaks in version 0.9.0 were fixed.
|
|
266
277
|
* `tomotopy.CTModel.summary()` was fixed.
|
|
@@ -380,3 +391,21 @@ History
|
|
|
380
391
|
|
|
381
392
|
* 0.1.0 (2019-05-12)
|
|
382
393
|
* First version of **tomotopy**
|
|
394
|
+
|
|
395
|
+
Bindings for Other Languages
|
|
396
|
+
------------------------------
|
|
397
|
+
|
|
398
|
+
* Ruby: https://github.com/ankane/tomoto
|
|
399
|
+
|
|
400
|
+
Bundled Libraries and Their License
|
|
401
|
+
------------------------------------
|
|
402
|
+
* Eigen:
|
|
403
|
+
This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
|
|
404
|
+
A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
|
|
405
|
+
The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
|
|
406
|
+
|
|
407
|
+
* EigenRand: `MIT License
|
|
408
|
+
<licenses_bundled/EigenRand>`_
|
|
409
|
+
|
|
410
|
+
* Mapbox Variant: `BSD License
|
|
411
|
+
<licenses_bundled/MapboxVariant>`_
|
|
@@ -5,161 +5,74 @@
|
|
|
5
5
|
|
|
6
6
|
using namespace tomoto::label;
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
class DocWrapper
|
|
9
9
|
{
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
const tomoto::DocumentBase* doc;
|
|
11
|
+
public:
|
|
12
|
+
DocWrapper(const tomoto::DocumentBase* _doc = nullptr)
|
|
13
|
+
: doc{ _doc }
|
|
12
14
|
{
|
|
13
|
-
|
|
14
|
-
{
|
|
15
|
-
return hash<tomoto::Vid>{}(k.first) ^ hash<tomoto::Vid>{}(k.second);
|
|
16
|
-
}
|
|
17
|
-
};
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel * tm) const
|
|
21
|
-
{
|
|
22
|
-
auto& vocabFreqs = tm->getVocabCf();
|
|
23
|
-
auto& vocabDf = tm->getVocabDf();
|
|
24
|
-
|
|
25
|
-
// counting unigrams & bigrams
|
|
26
|
-
std::unordered_map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
|
|
15
|
+
}
|
|
27
16
|
|
|
28
|
-
|
|
17
|
+
size_t size() const
|
|
29
18
|
{
|
|
30
|
-
|
|
31
|
-
auto doc = tm->getDoc(i);
|
|
32
|
-
Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
|
|
33
|
-
for (size_t j = 1; j < doc->words.size(); ++j)
|
|
34
|
-
{
|
|
35
|
-
Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
|
|
36
|
-
if (vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
|
|
37
|
-
{
|
|
38
|
-
if (vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
|
|
39
|
-
{
|
|
40
|
-
bigramCnt[std::make_pair(prevWord, curWord)]++;
|
|
41
|
-
uniqBigram.emplace(prevWord, curWord);
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
prevWord = curWord;
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
for (auto& p : uniqBigram) bigramDf[p]++;
|
|
19
|
+
return doc->words.size();
|
|
48
20
|
}
|
|
49
21
|
|
|
50
|
-
|
|
51
|
-
// counting ngrams
|
|
52
|
-
std::vector<TrieEx<Vid, size_t>> trieNodes;
|
|
53
|
-
|
|
54
|
-
if (maxLabelLen > 2)
|
|
22
|
+
tomoto::Vid operator[](size_t idx) const
|
|
55
23
|
{
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
if (p.second >= candMinCnt) validPair.emplace(p.first);
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
trieNodes.resize(1);
|
|
63
|
-
auto allocNode = [&]() { return trieNodes.emplace_back(), &trieNodes.back(); };
|
|
64
|
-
|
|
65
|
-
for (size_t i = 0; i < tm->getNumDocs(); ++i)
|
|
66
|
-
{
|
|
67
|
-
auto doc = tm->getDoc(i);
|
|
68
|
-
if (trieNodes.capacity() < trieNodes.size() + doc->words.size() * maxLabelLen)
|
|
69
|
-
{
|
|
70
|
-
trieNodes.reserve(std::max(trieNodes.size() + doc->words.size() * maxLabelLen, trieNodes.capacity() * 2));
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
|
|
74
|
-
size_t labelLen = 0;
|
|
75
|
-
auto node = &trieNodes[0];
|
|
76
|
-
if (vocabFreqs[prevWord] >= candMinCnt)
|
|
77
|
-
{
|
|
78
|
-
node = trieNodes[0].makeNext(prevWord, allocNode);
|
|
79
|
-
node->val++;
|
|
80
|
-
labelLen = 1;
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
for (size_t j = 1; j < doc->words.size(); ++j)
|
|
84
|
-
{
|
|
85
|
-
Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
|
|
24
|
+
return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
|
|
25
|
+
}
|
|
26
|
+
};
|
|
86
27
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
node = node->getFail();
|
|
97
|
-
labelLen--;
|
|
98
|
-
}
|
|
28
|
+
class DocIterator
|
|
29
|
+
{
|
|
30
|
+
const tomoto::ITopicModel* tm;
|
|
31
|
+
size_t idx;
|
|
32
|
+
public:
|
|
33
|
+
DocIterator(const tomoto::ITopicModel* _tm = nullptr, size_t _idx = 0)
|
|
34
|
+
: tm{ _tm }, idx{ _idx }
|
|
35
|
+
{
|
|
36
|
+
}
|
|
99
37
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
node = nnode;
|
|
104
|
-
do
|
|
105
|
-
{
|
|
106
|
-
nnode->val++;
|
|
107
|
-
} while (nnode = nnode->getFail());
|
|
108
|
-
labelLen++;
|
|
109
|
-
}
|
|
110
|
-
else
|
|
111
|
-
{
|
|
112
|
-
node = trieNodes[0].makeNext(curWord, allocNode);
|
|
113
|
-
node->val++;
|
|
114
|
-
labelLen = 1;
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
prevWord = curWord;
|
|
118
|
-
}
|
|
119
|
-
}
|
|
38
|
+
DocWrapper operator*() const
|
|
39
|
+
{
|
|
40
|
+
return { tm->getDoc(idx) };
|
|
120
41
|
}
|
|
121
42
|
|
|
122
|
-
|
|
123
|
-
std::vector<Candidate> candidates;
|
|
124
|
-
for (auto& p : bigramCnt)
|
|
43
|
+
DocIterator& operator++()
|
|
125
44
|
{
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
if (bigramDf[bigram] < candMinDf) continue;
|
|
129
|
-
auto pmi = std::log(p.second * (float)tm->getN()
|
|
130
|
-
/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
|
|
131
|
-
if (pmi <= 0) continue;
|
|
132
|
-
candidates.emplace_back(pmi, bigram.first, bigram.second);
|
|
45
|
+
++idx;
|
|
46
|
+
return *this;
|
|
133
47
|
}
|
|
134
48
|
|
|
135
|
-
|
|
49
|
+
bool operator==(const DocIterator& o) const
|
|
136
50
|
{
|
|
137
|
-
|
|
138
|
-
trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
|
|
139
|
-
{
|
|
140
|
-
if (rkeys.size() <= 2 || node->val < candMinCnt) return;
|
|
141
|
-
float n = (float)tm->getN();
|
|
142
|
-
auto pmi = node->val / n;
|
|
143
|
-
for (auto k : rkeys)
|
|
144
|
-
{
|
|
145
|
-
pmi *= n / vocabFreqs[k];
|
|
146
|
-
}
|
|
147
|
-
pmi = std::log(pmi);
|
|
148
|
-
candidates.emplace_back(pmi, rkeys);
|
|
149
|
-
}, rkeys);
|
|
51
|
+
return tm == o.tm && idx == o.idx;
|
|
150
52
|
}
|
|
151
53
|
|
|
152
|
-
|
|
54
|
+
bool operator!=(const DocIterator& o) const
|
|
153
55
|
{
|
|
154
|
-
return
|
|
155
|
-
}
|
|
156
|
-
|
|
56
|
+
return tm != o.tm || idx != o.idx;
|
|
57
|
+
}
|
|
58
|
+
};
|
|
157
59
|
|
|
158
|
-
|
|
60
|
+
std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) const
|
|
61
|
+
{
|
|
62
|
+
auto& vocabFreqs = tm->getVocabCf();
|
|
63
|
+
auto& vocabDf = tm->getVocabDf();
|
|
64
|
+
auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
|
|
65
|
+
vocabFreqs, vocabDf,
|
|
66
|
+
candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
|
|
67
|
+
);
|
|
68
|
+
if (minLabelLen <= 1)
|
|
159
69
|
{
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
70
|
+
for (size_t i = 0; i < vocabDf.size(); ++i)
|
|
71
|
+
{
|
|
72
|
+
if (vocabFreqs[i] < candMinCnt) continue;
|
|
73
|
+
if (vocabDf[i] < candMinDf) continue;
|
|
74
|
+
candidates.emplace_back(0.f, i);
|
|
75
|
+
}
|
|
163
76
|
}
|
|
164
77
|
return candidates;
|
|
165
78
|
}
|
|
@@ -172,8 +85,7 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
|
|
|
172
85
|
auto node = root;
|
|
173
86
|
for (size_t j = 0; j < doc->words.size(); ++j)
|
|
174
87
|
{
|
|
175
|
-
|
|
176
|
-
tomoto::Vid curWord = doc->words[t];
|
|
88
|
+
tomoto::Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
|
|
177
89
|
if (curWord < tm->getV()) bdf[curWord] = 1;
|
|
178
90
|
auto nnode = node->getNext(curWord);
|
|
179
91
|
while (!nnode)
|
|
@@ -191,16 +103,15 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
|
|
|
191
103
|
// the matched candidate is found
|
|
192
104
|
if (nnode->val && nnode->val != (size_t)-1)
|
|
193
105
|
{
|
|
194
|
-
auto& c = candidates[nnode->val - 1];
|
|
195
106
|
tomoto::OptionalLock<_lock> lock{ mtx[(nnode->val - 1) % (pool ? pool->getNumWorkers() : 1)] };
|
|
107
|
+
auto& c = candidates[nnode->val - 1];
|
|
196
108
|
if (c.name.empty() && !doc->origWordPos.empty())
|
|
197
109
|
{
|
|
198
110
|
size_t start = doc->origWordPos[j + 1 - c.w.size()];
|
|
199
111
|
size_t end = doc->origWordPos[j] + doc->origWordLen[j];
|
|
200
112
|
c.names[doc->rawStr.substr(start, end - start)]++;
|
|
201
113
|
}
|
|
202
|
-
|
|
203
|
-
if (docIds.empty() || docIds.back() != docId) docIds.emplace_back(docId);
|
|
114
|
+
c.docIds.emplace(docId);
|
|
204
115
|
}
|
|
205
116
|
} while (nnode = nnode->getFail());
|
|
206
117
|
}
|
|
@@ -268,7 +179,22 @@ void FoRelevance::estimateContexts()
|
|
|
268
179
|
wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
|
|
269
180
|
}
|
|
270
181
|
|
|
271
|
-
|
|
182
|
+
size_t totDocCnt = 0;
|
|
183
|
+
if (windowSize == (size_t)-1)
|
|
184
|
+
{
|
|
185
|
+
totDocCnt = tm->getNumDocs();
|
|
186
|
+
}
|
|
187
|
+
else
|
|
188
|
+
{
|
|
189
|
+
for (size_t i = 0; i < tm->getNumDocs(); ++i)
|
|
190
|
+
{
|
|
191
|
+
size_t s = tm->getDoc(i)->words.size();
|
|
192
|
+
if (s <= windowSize) totDocCnt += 1;
|
|
193
|
+
else totDocCnt += s - windowSize + 1;
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
auto calcScores = [&](CandidateEx& c, size_t windowSize)
|
|
272
198
|
{
|
|
273
199
|
if (c.docIds.size() < candMinDf) return;
|
|
274
200
|
if (c.name.empty() && !c.names.empty())
|
|
@@ -284,20 +210,80 @@ void FoRelevance::estimateContexts()
|
|
|
284
210
|
}
|
|
285
211
|
}
|
|
286
212
|
|
|
213
|
+
size_t docCnt = 0;
|
|
287
214
|
Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
|
|
288
215
|
for (auto& docId : c.docIds)
|
|
289
216
|
{
|
|
290
217
|
thread_local Eigen::VectorXi bdf(this->tm->getV());
|
|
291
218
|
bdf.setZero();
|
|
292
219
|
auto doc = this->tm->getDoc(docId);
|
|
293
|
-
|
|
220
|
+
if (doc->words.size() <= windowSize)
|
|
294
221
|
{
|
|
295
|
-
|
|
222
|
+
for (size_t i = 0; i < doc->words.size(); ++i)
|
|
223
|
+
{
|
|
224
|
+
if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
|
|
225
|
+
}
|
|
226
|
+
docCnt++;
|
|
227
|
+
wcPMI += bdf.template cast<Float>();
|
|
228
|
+
}
|
|
229
|
+
else
|
|
230
|
+
{
|
|
231
|
+
auto wit = c.w.begin();
|
|
232
|
+
std::deque<size_t> wpos;
|
|
233
|
+
for (size_t i = 0; i < windowSize; ++i)
|
|
234
|
+
{
|
|
235
|
+
Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
|
|
236
|
+
if (word < this->tm->getV()) bdf[word]++;
|
|
237
|
+
|
|
238
|
+
if (word == *wit)
|
|
239
|
+
{
|
|
240
|
+
if (++wit == c.w.end())
|
|
241
|
+
{
|
|
242
|
+
wpos.emplace_back(i + 1);
|
|
243
|
+
wit = c.w.begin();
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
else if (word == c.w[0]) wit = c.w.begin() + 1;
|
|
247
|
+
else wit = c.w.begin();
|
|
248
|
+
}
|
|
249
|
+
if (!wpos.empty())
|
|
250
|
+
{
|
|
251
|
+
docCnt++;
|
|
252
|
+
wcPMI += Eigen::bool2float(bdf.array()).matrix();
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
for (size_t i = windowSize; i < doc->words.size(); ++i)
|
|
256
|
+
{
|
|
257
|
+
Vid oword = doc->words[doc->wOrder.empty() ? (i - windowSize) : doc->wOrder[i - windowSize]];
|
|
258
|
+
Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
|
|
259
|
+
if (oword < this->tm->getV()) bdf[oword]--;
|
|
260
|
+
if (word < this->tm->getV()) bdf[word]++;
|
|
261
|
+
if (!wpos.empty() && wpos.front() - c.w.size() <= i - windowSize)
|
|
262
|
+
{
|
|
263
|
+
wpos.pop_front();
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
if (word == *wit)
|
|
267
|
+
{
|
|
268
|
+
if (++wit == c.w.end())
|
|
269
|
+
{
|
|
270
|
+
wpos.emplace_back(i + 1);
|
|
271
|
+
wit = c.w.begin();
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
else if (word == c.w[0]) wit = c.w.begin() + 1;
|
|
275
|
+
else wit = c.w.begin();
|
|
276
|
+
|
|
277
|
+
if (!wpos.empty())
|
|
278
|
+
{
|
|
279
|
+
docCnt++;
|
|
280
|
+
wcPMI += Eigen::bool2float(bdf.array()).matrix();
|
|
281
|
+
}
|
|
282
|
+
}
|
|
296
283
|
}
|
|
297
|
-
wcPMI += bdf.cast<Float>();
|
|
298
284
|
}
|
|
299
285
|
c.scores = wordTopicDist.transpose() *
|
|
300
|
-
((wcPMI.array() + smoothing) *
|
|
286
|
+
((wcPMI.array() + smoothing) * totDocCnt / docCnt / df.cast<Float>()).log().matrix();
|
|
301
287
|
};
|
|
302
288
|
|
|
303
289
|
if (pool)
|
|
@@ -311,7 +297,7 @@ void FoRelevance::estimateContexts()
|
|
|
311
297
|
{
|
|
312
298
|
for (size_t i = g; i < candidates.size(); i += groups)
|
|
313
299
|
{
|
|
314
|
-
calcScores(candidates[i]);
|
|
300
|
+
calcScores(candidates[i], windowSize);
|
|
315
301
|
}
|
|
316
302
|
}, g));
|
|
317
303
|
}
|
|
@@ -321,7 +307,7 @@ void FoRelevance::estimateContexts()
|
|
|
321
307
|
{
|
|
322
308
|
for (auto& c : candidates)
|
|
323
309
|
{
|
|
324
|
-
calcScores(c);
|
|
310
|
+
calcScores(c, windowSize);
|
|
325
311
|
}
|
|
326
312
|
}
|
|
327
313
|
|