tomoto 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -3
  5. data/ext/tomoto/ext.cpp +34 -9
  6. data/ext/tomoto/extconf.rb +2 -1
  7. data/lib/tomoto/dmr.rb +1 -1
  8. data/lib/tomoto/gdmr.rb +1 -1
  9. data/lib/tomoto/version.rb +1 -1
  10. data/vendor/tomotopy/LICENSE +1 -1
  11. data/vendor/tomotopy/README.kr.rst +32 -3
  12. data/vendor/tomotopy/README.rst +30 -1
  13. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
  14. data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
  15. data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
  16. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
  17. data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
  18. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
  20. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
  21. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
  22. data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
  23. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
  24. data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
  25. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
  26. data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
  27. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
  28. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
  29. data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
  30. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
  31. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
  32. data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
  33. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
  34. data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
  35. data/vendor/tomotopy/src/Utils/math.h +1 -1
  36. data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
  37. data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
  38. data/vendor/variant/LICENSE +25 -0
  39. data/vendor/variant/LICENSE_1_0.txt +23 -0
  40. data/vendor/variant/README.md +102 -0
  41. data/vendor/variant/include/mapbox/optional.hpp +74 -0
  42. data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
  43. data/vendor/variant/include/mapbox/variant.hpp +974 -0
  44. data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
  45. metadata +15 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd4c36ff621f73c38bb066694a932f0a682c18591ddf05a9a0764bea0b6e4430
4
- data.tar.gz: 551e56c4bc17fb5a3a0aeac0db055960fcc5e45bf097bf88c7cbf9046f958e7d
3
+ metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
4
+ data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
5
5
  SHA512:
6
- metadata.gz: 565a91d0bb6d48142f38dc3d9e798ddb99bf41fda32762295362075fba972eea6b56b6bde126eab74677eba5fd525581b68c5efa73361a46fcb0b2796ab63684
7
- data.tar.gz: 415193e4eb6adbe5dce05328aadf9acb91f4acc50951484183a956455d7336f93961fe145465b1eeffaae78dad37ee1452defe832514c72b3c032860ed433cc8
6
+ metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
7
+ data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422
@@ -1,3 +1,7 @@
1
+ ## 0.1.3 (2020-12-19)
2
+
3
+ - Updated tomoto to 0.10.0
4
+
1
5
  ## 0.1.2 (2020-10-10)
2
6
 
3
7
  - Added `summary` method
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2019
3
+ Copyright (c) 2019, bab2min
4
4
  Copyright (c) 2020 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
4
 
5
- [![Build Status](https://travis-ci.org/ankane/tomoto.svg?branch=master)](https://travis-ci.org/ankane/tomoto)
5
+ [![Build Status](https://github.com/ankane/tomoto/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -19,7 +19,7 @@ It can take 10-20 minutes to compile the extension.
19
19
  Train a model
20
20
 
21
21
  ```ruby
22
- model = Tomoto::LDA.new(k: 3)
22
+ model = Tomoto::LDA.new(k: 2)
23
23
  model.add_doc("text from document one")
24
24
  model.add_doc("text from document two")
25
25
  model.add_doc("text from document three")
@@ -98,7 +98,7 @@ If a method or option you need isn’t supported, feel free to open an issue.
98
98
  ## Examples
99
99
 
100
100
  - [LDA](examples/lda_basic.rb)
101
- - [HDP](examples/hdp.rb)
101
+ - [HDP](examples/hdp_basic.rb)
102
102
 
103
103
  ## Tokenization
104
104
 
@@ -96,6 +96,12 @@ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
96
96
  return res;
97
97
  }
98
98
 
99
+ tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
100
+ tomoto::RawDoc doc;
101
+ doc.rawWords = words;
102
+ return doc;
103
+ }
104
+
99
105
  extern "C"
100
106
  void Init_ext()
101
107
  {
@@ -126,7 +132,7 @@ void Init_ext()
126
132
  .define_method(
127
133
  "_add_doc",
128
134
  *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
129
- self.addDoc(words);
135
+ self.addDoc(buildDoc(words));
130
136
  })
131
137
  .define_method(
132
138
  "alpha",
@@ -379,8 +385,10 @@ void Init_ext()
379
385
  })
380
386
  .define_method(
381
387
  "_add_doc",
382
- *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<std::string> metadata) {
383
- self.addDoc(words, metadata);
388
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
389
+ auto doc = buildDoc(words);
390
+ doc.misc["metadata"] = metadata;
391
+ self.addDoc(doc);
384
392
  })
385
393
  .define_method(
386
394
  "alpha_epsilon",
@@ -433,8 +441,10 @@ void Init_ext()
433
441
  })
434
442
  .define_method(
435
443
  "_add_doc",
436
- *[](tomoto::IDTModel& self, std::vector<std::string> words, size_t timepoint) {
437
- self.addDoc(words, timepoint);
444
+ *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
445
+ auto doc = buildDoc(words);
446
+ doc.misc["timepoint"] = timepoint;
447
+ self.addDoc(doc);
438
448
  })
439
449
  .define_method(
440
450
  "lr_a",
@@ -489,6 +499,13 @@ void Init_ext()
489
499
  }
490
500
  return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
491
501
  })
502
+ .define_method(
503
+ "_add_doc",
504
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
505
+ auto doc = buildDoc(words);
506
+ doc.misc["metadata"] = metadata;
507
+ self.addDoc(doc);
508
+ })
492
509
  .define_method(
493
510
  "degrees",
494
511
  *[](tomoto::IGDMRModel& self) {
@@ -643,7 +660,9 @@ void Init_ext()
643
660
  .define_method(
644
661
  "_add_doc",
645
662
  *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
646
- self.addDoc(words, delimiter);
663
+ auto doc = buildDoc(words);
664
+ doc.misc["delimiter"] = delimiter;
665
+ self.addDoc(doc);
647
666
  })
648
667
  .define_method(
649
668
  "alpha_g",
@@ -708,7 +727,9 @@ void Init_ext()
708
727
  .define_method(
709
728
  "_add_doc",
710
729
  *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
711
- self.addDoc(words, labels);
730
+ auto doc = buildDoc(words);
731
+ doc.misc["labels"] = labels;
732
+ self.addDoc(doc);
712
733
  })
713
734
  .define_method(
714
735
  "topics_per_label",
@@ -728,7 +749,9 @@ void Init_ext()
728
749
  .define_method(
729
750
  "_add_doc",
730
751
  *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
731
- self.addDoc(words, labels);
752
+ auto doc = buildDoc(words);
753
+ doc.misc["labels"] = labels;
754
+ self.addDoc(doc);
732
755
  })
733
756
  .define_method(
734
757
  "latent_topics",
@@ -753,7 +776,9 @@ void Init_ext()
753
776
  .define_method(
754
777
  "_add_doc",
755
778
  *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
756
- self.addDoc(words, y);
779
+ auto doc = buildDoc(words);
780
+ doc.misc["y"] = y;
781
+ self.addDoc(doc);
757
782
  })
758
783
  .define_method(
759
784
  "f",
@@ -23,9 +23,10 @@ ext = File.expand_path(".", __dir__)
23
23
  tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
24
24
  eigen = File.expand_path("../../vendor/eigen", __dir__)
25
25
  eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
26
+ variant = File.expand_path("../../vendor/variant/include", __dir__)
26
27
 
27
28
  $srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
28
- $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand}"
29
+ $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
29
30
  $VPATH << tomoto
30
31
 
31
32
  create_makefile("tomoto/ext")
@@ -9,7 +9,7 @@ module Tomoto
9
9
  end
10
10
 
11
11
  def add_doc(doc, metadata: "")
12
- _add_doc(prepare_doc(doc), [metadata])
12
+ _add_doc(prepare_doc(doc), metadata)
13
13
  end
14
14
 
15
15
  def lambdas
@@ -9,7 +9,7 @@ module Tomoto
9
9
  end
10
10
 
11
11
  def add_doc(doc, metadata: [])
12
- _add_doc(prepare_doc(doc), metadata.map(&:to_s))
12
+ _add_doc(prepare_doc(doc), metadata)
13
13
  end
14
14
  end
15
15
  end
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2019
3
+ Copyright (c) 2019, bab2min
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -35,7 +35,7 @@ tomotopy 란?
35
35
 
36
36
  더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
37
37
 
38
- tomotopy의 가장 최신버전은 0.9.1 입니다.
38
+ tomotopy의 가장 최신버전은 0.10.0 입니다.
39
39
 
40
40
  시작하기
41
41
  ---------------
@@ -255,6 +255,17 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
255
255
 
256
256
  역사
257
257
  -------
258
+ * 0.10.0 (2020-12-19)
259
+ * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
260
+ * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
261
+ * `tomotopy.utils.Corpus.extract_ngrams`와 `tomotopy.utils.Corpus.concat_ngrams`이 추가되었습니다. PMI를 이용해 코퍼스 내에서 자동으로 n-gram collocation을 발견해 한 단어로 합치는 기능을 수행합니다.
262
+ * `tomotopy.LDAModel.add_corpus`가 추가되었고, `tomotopy.LDAModel.infer`가 Raw 코퍼스를 입력으로 받을 수 있게 되었습니다.
263
+ * `tomotopy.coherence` 모듈이 추가되었습니다. 생성된 토픽 모델의 coherence를 계산하는 기능을 담당합니다.
264
+ * `tomotopy.label.FoRelevance`에 window_size 파라미터가 추가되었습니다.
265
+ * `tomotopy.HDPModel` 학습 시 종종 NaN이 발생하는 문제를 해결했습니다.
266
+ * 이제 Python3.9를 지원합니다.
267
+ * py-cpuinfo에 대한 의존성이 제거되고, 모듈 로딩속도가 개선되었습니다.
268
+
258
269
  * 0.9.1 (2020-08-08)
259
270
  * 0.9.0 버전의 메모리 누수 문제가 해결되었습니다.
260
271
  * `tomotopy.CTModel.summary()`가 잘못된 결과를 출력하는 문제가 해결되었습니다.
@@ -277,7 +288,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
277
288
 
278
289
  * 0.8.2 (2020-07-14)
279
290
  * `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다.
280
- * `seed`가 동일해서 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
291
+ * `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
281
292
 
282
293
  * 0.8.1 (2020-06-08)
283
294
  * `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다.
@@ -302,7 +313,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
302
313
  * 0.7.0 (2020-04-18)
303
314
  * `tomotopy.DTModel`이 추가되었습니다.
304
315
  * `tomotopy.utils.Corpus.save`가 제대로 작동하지 않는 버그가 수정되었습니다.
305
- * `tomotopy.LDAModel.get_count_vector`가 추가되었습니다.
316
+ * `tomotopy.Document.get_count_vector`가 추가되었습니다.
306
317
  * 리눅스용 바이너리가 manylinux2010 버전으로 변경되었고 이에 따른 최적화가 진행되었습니다.
307
318
 
308
319
  * 0.6.2 (2020-03-28)
@@ -373,3 +384,21 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
373
384
 
374
385
  * 0.1.0 (2019-05-12)
375
386
  * **tomotopy**의 최초 버전
387
+
388
+ 다른 언어용 바인딩
389
+ -------------------
390
+
391
+ * Ruby: https://github.com/ankane/tomoto
392
+
393
+ 포함된 라이브러리들의 라이센스
394
+ -------------------------------
395
+ * Eigen:
396
+ This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
397
+ A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
398
+ The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
399
+
400
+ * EigenRand: `MIT License
401
+ <licenses_bundled/EigenRand>`_
402
+
403
+ * Mapbox Variant: `BSD License
404
+ <licenses_bundled/MapboxVariant>`_
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
36
36
 
37
37
  Please visit https://bab2min.github.io/tomotopy to see more information.
38
38
 
39
- The most recent version of tomotopy is 0.9.1.
39
+ The most recent version of tomotopy is 0.10.0.
40
40
 
41
41
  Getting Started
42
42
  ---------------
@@ -261,6 +261,17 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
261
261
 
262
262
  History
263
263
  -------
264
+ * 0.10.0 (2020-12-19)
265
+ * The interfaces of `tomotopy.utils.Corpus` and of `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
266
+ * __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also by Iterable[int], slicing are supported. Also indexing by uid is supported.
267
+ * New methods `tomotopy.utils.Corpus.extract_ngrams` and `tomotopy.utils.Corpus.concat_ngrams` were added. They extract n-gram collocations using PMI and concatenate them into single words.
268
+ * A new method `tomotopy.LDAModel.add_corpus` was added, and `tomotopy.LDAModel.infer` can receive corpus as input.
269
+ * A new module `tomotopy.coherence` was added. It provides the way to calculate coherence of the model.
270
+ * A parameter `window_size` was added to `tomotopy.label.FoRelevance`.
271
+ * An issue was fixed where NaN often occurs when training `tomotopy.HDPModel`.
272
+ * Now Python3.9 is supported.
273
+ * The dependency on py-cpuinfo was removed, and module initialization was improved.
274
+
264
275
  * 0.9.1 (2020-08-08)
265
276
  * Memory leaks of version 0.9.0 was fixed.
266
277
  * `tomotopy.CTModel.summary()` was fixed.
@@ -380,3 +391,21 @@ History
380
391
 
381
392
  * 0.1.0 (2019-05-12)
382
393
  * First version of **tomotopy**
394
+
395
+ Bindings for Other Languages
396
+ ------------------------------
397
+
398
+ * Ruby: https://github.com/ankane/tomoto
399
+
400
+ Bundled Libraries and Their License
401
+ ------------------------------------
402
+ * Eigen:
403
+ This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
404
+ A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
405
+ The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
406
+
407
+ * EigenRand: `MIT License
408
+ <licenses_bundled/EigenRand>`_
409
+
410
+ * Mapbox Variant: `BSD License
411
+ <licenses_bundled/MapboxVariant>`_
@@ -5,161 +5,74 @@
5
5
 
6
6
  using namespace tomoto::label;
7
7
 
8
- namespace std
8
+ class DocWrapper
9
9
  {
10
- template <>
11
- struct hash<pair<tomoto::Vid, tomoto::Vid>>
10
+ const tomoto::DocumentBase* doc;
11
+ public:
12
+ DocWrapper(const tomoto::DocumentBase* _doc = nullptr)
13
+ : doc{ _doc }
12
14
  {
13
- size_t operator()(const pair<tomoto::Vid, tomoto::Vid>& k) const
14
- {
15
- return hash<tomoto::Vid>{}(k.first) ^ hash<tomoto::Vid>{}(k.second);
16
- }
17
- };
18
- }
19
-
20
- std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel * tm) const
21
- {
22
- auto& vocabFreqs = tm->getVocabCf();
23
- auto& vocabDf = tm->getVocabDf();
24
-
25
- // counting unigrams & bigrams
26
- std::unordered_map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
15
+ }
27
16
 
28
- for (size_t i = 0; i < tm->getNumDocs(); ++i)
17
+ size_t size() const
29
18
  {
30
- std::unordered_set<std::pair<Vid, Vid>> uniqBigram;
31
- auto doc = tm->getDoc(i);
32
- Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
33
- for (size_t j = 1; j < doc->words.size(); ++j)
34
- {
35
- Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
36
- if (vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
37
- {
38
- if (vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
39
- {
40
- bigramCnt[std::make_pair(prevWord, curWord)]++;
41
- uniqBigram.emplace(prevWord, curWord);
42
- }
43
- }
44
- prevWord = curWord;
45
- }
46
-
47
- for (auto& p : uniqBigram) bigramDf[p]++;
19
+ return doc->words.size();
48
20
  }
49
21
 
50
-
51
- // counting ngrams
52
- std::vector<TrieEx<Vid, size_t>> trieNodes;
53
-
54
- if (maxLabelLen > 2)
22
+ tomoto::Vid operator[](size_t idx) const
55
23
  {
56
- std::unordered_set<std::pair<Vid, Vid>> validPair;
57
- for (auto& p : bigramCnt)
58
- {
59
- if (p.second >= candMinCnt) validPair.emplace(p.first);
60
- }
61
-
62
- trieNodes.resize(1);
63
- auto allocNode = [&]() { return trieNodes.emplace_back(), &trieNodes.back(); };
64
-
65
- for (size_t i = 0; i < tm->getNumDocs(); ++i)
66
- {
67
- auto doc = tm->getDoc(i);
68
- if (trieNodes.capacity() < trieNodes.size() + doc->words.size() * maxLabelLen)
69
- {
70
- trieNodes.reserve(std::max(trieNodes.size() + doc->words.size() * maxLabelLen, trieNodes.capacity() * 2));
71
- }
72
-
73
- Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
74
- size_t labelLen = 0;
75
- auto node = &trieNodes[0];
76
- if (vocabFreqs[prevWord] >= candMinCnt)
77
- {
78
- node = trieNodes[0].makeNext(prevWord, allocNode);
79
- node->val++;
80
- labelLen = 1;
81
- }
82
-
83
- for (size_t j = 1; j < doc->words.size(); ++j)
84
- {
85
- Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
24
+ return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
25
+ }
26
+ };
86
27
 
87
- if (vocabFreqs[curWord] < candMinCnt)
88
- {
89
- node = &trieNodes[0];
90
- labelLen = 0;
91
- }
92
- else
93
- {
94
- if (labelLen >= maxLabelLen)
95
- {
96
- node = node->getFail();
97
- labelLen--;
98
- }
28
+ class DocIterator
29
+ {
30
+ const tomoto::ITopicModel* tm;
31
+ size_t idx;
32
+ public:
33
+ DocIterator(const tomoto::ITopicModel* _tm = nullptr, size_t _idx = 0)
34
+ : tm{ _tm }, idx{ _idx }
35
+ {
36
+ }
99
37
 
100
- if (validPair.count(std::make_pair(prevWord, curWord)))
101
- {
102
- auto nnode = node->makeNext(curWord, allocNode);
103
- node = nnode;
104
- do
105
- {
106
- nnode->val++;
107
- } while (nnode = nnode->getFail());
108
- labelLen++;
109
- }
110
- else
111
- {
112
- node = trieNodes[0].makeNext(curWord, allocNode);
113
- node->val++;
114
- labelLen = 1;
115
- }
116
- }
117
- prevWord = curWord;
118
- }
119
- }
38
+ DocWrapper operator*() const
39
+ {
40
+ return { tm->getDoc(idx) };
120
41
  }
121
42
 
122
- // calculating PMIs
123
- std::vector<Candidate> candidates;
124
- for (auto& p : bigramCnt)
43
+ DocIterator& operator++()
125
44
  {
126
- auto& bigram = p.first;
127
- if (p.second < candMinCnt) continue;
128
- if (bigramDf[bigram] < candMinDf) continue;
129
- auto pmi = std::log(p.second * (float)tm->getN()
130
- / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
131
- if (pmi <= 0) continue;
132
- candidates.emplace_back(pmi, bigram.first, bigram.second);
45
+ ++idx;
46
+ return *this;
133
47
  }
134
48
 
135
- if (maxLabelLen > 2)
49
+ bool operator==(const DocIterator& o) const
136
50
  {
137
- std::vector<Vid> rkeys;
138
- trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
139
- {
140
- if (rkeys.size() <= 2 || node->val < candMinCnt) return;
141
- float n = (float)tm->getN();
142
- auto pmi = node->val / n;
143
- for (auto k : rkeys)
144
- {
145
- pmi *= n / vocabFreqs[k];
146
- }
147
- pmi = std::log(pmi);
148
- candidates.emplace_back(pmi, rkeys);
149
- }, rkeys);
51
+ return tm == o.tm && idx == o.idx;
150
52
  }
151
53
 
152
- std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
54
+ bool operator!=(const DocIterator& o) const
153
55
  {
154
- return a.score > b.score;
155
- });
156
- if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
56
+ return tm != o.tm || idx != o.idx;
57
+ }
58
+ };
157
59
 
158
- for (size_t i = 0; i < vocabDf.size(); ++i)
60
+ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) const
61
+ {
62
+ auto& vocabFreqs = tm->getVocabCf();
63
+ auto& vocabDf = tm->getVocabDf();
64
+ auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
65
+ vocabFreqs, vocabDf,
66
+ candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
67
+ );
68
+ if (minLabelLen <= 1)
159
69
  {
160
- if (vocabFreqs[i] < candMinCnt) continue;
161
- if (vocabDf[i] < candMinDf) continue;
162
- candidates.emplace_back(0.f, i);
70
+ for (size_t i = 0; i < vocabDf.size(); ++i)
71
+ {
72
+ if (vocabFreqs[i] < candMinCnt) continue;
73
+ if (vocabDf[i] < candMinDf) continue;
74
+ candidates.emplace_back(0.f, i);
75
+ }
163
76
  }
164
77
  return candidates;
165
78
  }
@@ -172,8 +85,7 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
172
85
  auto node = root;
173
86
  for (size_t j = 0; j < doc->words.size(); ++j)
174
87
  {
175
- size_t t = doc->wOrder.empty() ? j : doc->wOrder[j];
176
- tomoto::Vid curWord = doc->words[t];
88
+ tomoto::Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
177
89
  if (curWord < tm->getV()) bdf[curWord] = 1;
178
90
  auto nnode = node->getNext(curWord);
179
91
  while (!nnode)
@@ -191,16 +103,15 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
191
103
  // the matched candidate is found
192
104
  if (nnode->val && nnode->val != (size_t)-1)
193
105
  {
194
- auto& c = candidates[nnode->val - 1];
195
106
  tomoto::OptionalLock<_lock> lock{ mtx[(nnode->val - 1) % (pool ? pool->getNumWorkers() : 1)] };
107
+ auto& c = candidates[nnode->val - 1];
196
108
  if (c.name.empty() && !doc->origWordPos.empty())
197
109
  {
198
110
  size_t start = doc->origWordPos[j + 1 - c.w.size()];
199
111
  size_t end = doc->origWordPos[j] + doc->origWordLen[j];
200
112
  c.names[doc->rawStr.substr(start, end - start)]++;
201
113
  }
202
- auto& docIds = c.docIds;
203
- if (docIds.empty() || docIds.back() != docId) docIds.emplace_back(docId);
114
+ c.docIds.emplace(docId);
204
115
  }
205
116
  } while (nnode = nnode->getFail());
206
117
  }
@@ -268,7 +179,22 @@ void FoRelevance::estimateContexts()
268
179
  wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
269
180
  }
270
181
 
271
- auto calcScores = [&](CandidateEx& c)
182
+ size_t totDocCnt = 0;
183
+ if (windowSize == (size_t)-1)
184
+ {
185
+ totDocCnt = tm->getNumDocs();
186
+ }
187
+ else
188
+ {
189
+ for (size_t i = 0; i < tm->getNumDocs(); ++i)
190
+ {
191
+ size_t s = tm->getDoc(i)->words.size();
192
+ if (s <= windowSize) totDocCnt += 1;
193
+ else totDocCnt += s - windowSize + 1;
194
+ }
195
+ }
196
+
197
+ auto calcScores = [&](CandidateEx& c, size_t windowSize)
272
198
  {
273
199
  if (c.docIds.size() < candMinDf) return;
274
200
  if (c.name.empty() && !c.names.empty())
@@ -284,20 +210,80 @@ void FoRelevance::estimateContexts()
284
210
  }
285
211
  }
286
212
 
213
+ size_t docCnt = 0;
287
214
  Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
288
215
  for (auto& docId : c.docIds)
289
216
  {
290
217
  thread_local Eigen::VectorXi bdf(this->tm->getV());
291
218
  bdf.setZero();
292
219
  auto doc = this->tm->getDoc(docId);
293
- for (size_t i = 0; i < doc->words.size(); ++i)
220
+ if (doc->words.size() <= windowSize)
294
221
  {
295
- if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
222
+ for (size_t i = 0; i < doc->words.size(); ++i)
223
+ {
224
+ if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
225
+ }
226
+ docCnt++;
227
+ wcPMI += bdf.template cast<Float>();
228
+ }
229
+ else
230
+ {
231
+ auto wit = c.w.begin();
232
+ std::deque<size_t> wpos;
233
+ for (size_t i = 0; i < windowSize; ++i)
234
+ {
235
+ Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
236
+ if (word < this->tm->getV()) bdf[word]++;
237
+
238
+ if (word == *wit)
239
+ {
240
+ if (++wit == c.w.end())
241
+ {
242
+ wpos.emplace_back(i + 1);
243
+ wit = c.w.begin();
244
+ }
245
+ }
246
+ else if (word == c.w[0]) wit = c.w.begin() + 1;
247
+ else wit = c.w.begin();
248
+ }
249
+ if (!wpos.empty())
250
+ {
251
+ docCnt++;
252
+ wcPMI += Eigen::bool2float(bdf.array()).matrix();
253
+ }
254
+
255
+ for (size_t i = windowSize; i < doc->words.size(); ++i)
256
+ {
257
+ Vid oword = doc->words[doc->wOrder.empty() ? (i - windowSize) : doc->wOrder[i - windowSize]];
258
+ Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
259
+ if (oword < this->tm->getV()) bdf[oword]--;
260
+ if (word < this->tm->getV()) bdf[word]++;
261
+ if (!wpos.empty() && wpos.front() - c.w.size() <= i - windowSize)
262
+ {
263
+ wpos.pop_front();
264
+ }
265
+
266
+ if (word == *wit)
267
+ {
268
+ if (++wit == c.w.end())
269
+ {
270
+ wpos.emplace_back(i + 1);
271
+ wit = c.w.begin();
272
+ }
273
+ }
274
+ else if (word == c.w[0]) wit = c.w.begin() + 1;
275
+ else wit = c.w.begin();
276
+
277
+ if (!wpos.empty())
278
+ {
279
+ docCnt++;
280
+ wcPMI += Eigen::bool2float(bdf.array()).matrix();
281
+ }
282
+ }
296
283
  }
297
- wcPMI += bdf.cast<Float>();
298
284
  }
299
285
  c.scores = wordTopicDist.transpose() *
300
- ((wcPMI.array() + smoothing) * this->tm->getNumDocs() / c.docIds.size() / df.cast<Float>()).log().matrix();
286
+ ((wcPMI.array() + smoothing) * totDocCnt / docCnt / df.cast<Float>()).log().matrix();
301
287
  };
302
288
 
303
289
  if (pool)
@@ -311,7 +297,7 @@ void FoRelevance::estimateContexts()
311
297
  {
312
298
  for (size_t i = g; i < candidates.size(); i += groups)
313
299
  {
314
- calcScores(candidates[i]);
300
+ calcScores(candidates[i], windowSize);
315
301
  }
316
302
  }, g));
317
303
  }
@@ -321,7 +307,7 @@ void FoRelevance::estimateContexts()
321
307
  {
322
308
  for (auto& c : candidates)
323
309
  {
324
- calcScores(c);
310
+ calcScores(c, windowSize);
325
311
  }
326
312
  }
327
313