tomoto 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -3
  5. data/ext/tomoto/ext.cpp +34 -9
  6. data/ext/tomoto/extconf.rb +2 -1
  7. data/lib/tomoto/dmr.rb +1 -1
  8. data/lib/tomoto/gdmr.rb +1 -1
  9. data/lib/tomoto/version.rb +1 -1
  10. data/vendor/tomotopy/LICENSE +1 -1
  11. data/vendor/tomotopy/README.kr.rst +32 -3
  12. data/vendor/tomotopy/README.rst +30 -1
  13. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
  14. data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
  15. data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
  16. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
  17. data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
  18. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
  20. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
  21. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
  22. data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
  23. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
  24. data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
  25. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
  26. data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
  27. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
  28. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
  29. data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
  30. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
  31. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
  32. data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
  33. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
  34. data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
  35. data/vendor/tomotopy/src/Utils/math.h +1 -1
  36. data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
  37. data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
  38. data/vendor/variant/LICENSE +25 -0
  39. data/vendor/variant/LICENSE_1_0.txt +23 -0
  40. data/vendor/variant/README.md +102 -0
  41. data/vendor/variant/include/mapbox/optional.hpp +74 -0
  42. data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
  43. data/vendor/variant/include/mapbox/variant.hpp +974 -0
  44. data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
  45. metadata +15 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd4c36ff621f73c38bb066694a932f0a682c18591ddf05a9a0764bea0b6e4430
4
- data.tar.gz: 551e56c4bc17fb5a3a0aeac0db055960fcc5e45bf097bf88c7cbf9046f958e7d
3
+ metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
4
+ data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
5
5
  SHA512:
6
- metadata.gz: 565a91d0bb6d48142f38dc3d9e798ddb99bf41fda32762295362075fba972eea6b56b6bde126eab74677eba5fd525581b68c5efa73361a46fcb0b2796ab63684
7
- data.tar.gz: 415193e4eb6adbe5dce05328aadf9acb91f4acc50951484183a956455d7336f93961fe145465b1eeffaae78dad37ee1452defe832514c72b3c032860ed433cc8
6
+ metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
7
+ data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422
@@ -1,3 +1,7 @@
1
+ ## 0.1.3 (2020-12-19)
2
+
3
+ - Updated tomoto to 0.10.0
4
+
1
5
  ## 0.1.2 (2020-10-10)
2
6
 
3
7
  - Added `summary` method
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2019
3
+ Copyright (c) 2019, bab2min
4
4
  Copyright (c) 2020 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
4
 
5
- [![Build Status](https://travis-ci.org/ankane/tomoto.svg?branch=master)](https://travis-ci.org/ankane/tomoto)
5
+ [![Build Status](https://github.com/ankane/tomoto/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -19,7 +19,7 @@ It can take 10-20 minutes to compile the extension.
19
19
  Train a model
20
20
 
21
21
  ```ruby
22
- model = Tomoto::LDA.new(k: 3)
22
+ model = Tomoto::LDA.new(k: 2)
23
23
  model.add_doc("text from document one")
24
24
  model.add_doc("text from document two")
25
25
  model.add_doc("text from document three")
@@ -98,7 +98,7 @@ If a method or option you need isn’t supported, feel free to open an issue.
98
98
  ## Examples
99
99
 
100
100
  - [LDA](examples/lda_basic.rb)
101
- - [HDP](examples/hdp.rb)
101
+ - [HDP](examples/hdp_basic.rb)
102
102
 
103
103
  ## Tokenization
104
104
 
@@ -96,6 +96,12 @@ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
96
96
  return res;
97
97
  }
98
98
 
99
+ tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
100
+ tomoto::RawDoc doc;
101
+ doc.rawWords = words;
102
+ return doc;
103
+ }
104
+
99
105
  extern "C"
100
106
  void Init_ext()
101
107
  {
@@ -126,7 +132,7 @@ void Init_ext()
126
132
  .define_method(
127
133
  "_add_doc",
128
134
  *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
129
- self.addDoc(words);
135
+ self.addDoc(buildDoc(words));
130
136
  })
131
137
  .define_method(
132
138
  "alpha",
@@ -379,8 +385,10 @@ void Init_ext()
379
385
  })
380
386
  .define_method(
381
387
  "_add_doc",
382
- *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<std::string> metadata) {
383
- self.addDoc(words, metadata);
388
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
389
+ auto doc = buildDoc(words);
390
+ doc.misc["metadata"] = metadata;
391
+ self.addDoc(doc);
384
392
  })
385
393
  .define_method(
386
394
  "alpha_epsilon",
@@ -433,8 +441,10 @@ void Init_ext()
433
441
  })
434
442
  .define_method(
435
443
  "_add_doc",
436
- *[](tomoto::IDTModel& self, std::vector<std::string> words, size_t timepoint) {
437
- self.addDoc(words, timepoint);
444
+ *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
445
+ auto doc = buildDoc(words);
446
+ doc.misc["timepoint"] = timepoint;
447
+ self.addDoc(doc);
438
448
  })
439
449
  .define_method(
440
450
  "lr_a",
@@ -489,6 +499,13 @@ void Init_ext()
489
499
  }
490
500
  return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
491
501
  })
502
+ .define_method(
503
+ "_add_doc",
504
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
505
+ auto doc = buildDoc(words);
506
+ doc.misc["metadata"] = metadata;
507
+ self.addDoc(doc);
508
+ })
492
509
  .define_method(
493
510
  "degrees",
494
511
  *[](tomoto::IGDMRModel& self) {
@@ -643,7 +660,9 @@ void Init_ext()
643
660
  .define_method(
644
661
  "_add_doc",
645
662
  *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
646
- self.addDoc(words, delimiter);
663
+ auto doc = buildDoc(words);
664
+ doc.misc["delimiter"] = delimiter;
665
+ self.addDoc(doc);
647
666
  })
648
667
  .define_method(
649
668
  "alpha_g",
@@ -708,7 +727,9 @@ void Init_ext()
708
727
  .define_method(
709
728
  "_add_doc",
710
729
  *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
711
- self.addDoc(words, labels);
730
+ auto doc = buildDoc(words);
731
+ doc.misc["labels"] = labels;
732
+ self.addDoc(doc);
712
733
  })
713
734
  .define_method(
714
735
  "topics_per_label",
@@ -728,7 +749,9 @@ void Init_ext()
728
749
  .define_method(
729
750
  "_add_doc",
730
751
  *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
731
- self.addDoc(words, labels);
752
+ auto doc = buildDoc(words);
753
+ doc.misc["labels"] = labels;
754
+ self.addDoc(doc);
732
755
  })
733
756
  .define_method(
734
757
  "latent_topics",
@@ -753,7 +776,9 @@ void Init_ext()
753
776
  .define_method(
754
777
  "_add_doc",
755
778
  *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
756
- self.addDoc(words, y);
779
+ auto doc = buildDoc(words);
780
+ doc.misc["y"] = y;
781
+ self.addDoc(doc);
757
782
  })
758
783
  .define_method(
759
784
  "f",
@@ -23,9 +23,10 @@ ext = File.expand_path(".", __dir__)
23
23
  tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
24
24
  eigen = File.expand_path("../../vendor/eigen", __dir__)
25
25
  eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
26
+ variant = File.expand_path("../../vendor/variant/include", __dir__)
26
27
 
27
28
  $srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
28
- $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand}"
29
+ $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
29
30
  $VPATH << tomoto
30
31
 
31
32
  create_makefile("tomoto/ext")
@@ -9,7 +9,7 @@ module Tomoto
9
9
  end
10
10
 
11
11
  def add_doc(doc, metadata: "")
12
- _add_doc(prepare_doc(doc), [metadata])
12
+ _add_doc(prepare_doc(doc), metadata)
13
13
  end
14
14
 
15
15
  def lambdas
@@ -9,7 +9,7 @@ module Tomoto
9
9
  end
10
10
 
11
11
  def add_doc(doc, metadata: [])
12
- _add_doc(prepare_doc(doc), metadata.map(&:to_s))
12
+ _add_doc(prepare_doc(doc), metadata)
13
13
  end
14
14
  end
15
15
  end
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2019
3
+ Copyright (c) 2019, bab2min
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -35,7 +35,7 @@ tomotopy 란?
35
35
 
36
36
  더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
37
37
 
38
- tomotopy의 가장 최신버전은 0.9.1 입니다.
38
+ tomotopy의 가장 최신버전은 0.10.0 입니다.
39
39
 
40
40
  시작하기
41
41
  ---------------
@@ -255,6 +255,17 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
255
255
 
256
256
  역사
257
257
  -------
258
+ * 0.10.0 (2020-12-19)
259
+ * `tomotopy.utils.Corpus`와 `tomotopy.LDAModel.docs` 간의 인터페이스가 통일되었습니다. 이제 동일한 방법으로 코퍼스 내의 문헌들에 접근할 수 있습니다.
260
+ * `tomotopy.utils.Corpus`의 __getitem__이 개선되었습니다. int 타입 인덱싱뿐만 아니라 Iterable[int]나 slicing를 이용한 다중 인덱싱, uid를 이용한 인덱싱 등이 제공됩니다.
261
+ * `tomotopy.utils.Corpus.extract_ngrams`와 `tomotopy.utils.Corpus.concat_ngrams`이 추가되었습니다. PMI를 이용해 코퍼스 내에서 자동으로 n-gram collocation을 발견해 한 단어로 합치는 기능을 수행합니다.
262
+ * `tomotopy.LDAModel.add_corpus`가 추가되었고, `tomotopy.LDAModel.infer`가 Raw 코퍼스를 입력으로 받을 수 있게 되었습니다.
263
+ * `tomotopy.coherence` 모듈이 추가되었습니다. 생성된 토픽 모델의 coherence를 계산하는 기능을 담당합니다.
264
+ * `tomotopy.label.FoRelevance`에 window_size 파라미터가 추가되었습니다.
265
+ * `tomotopy.HDPModel` 학습 시 종종 NaN이 발생하는 문제를 해결했습니다.
266
+ * 이제 Python3.9를 지원합니다.
267
+ * py-cpuinfo에 대한 의존성이 제거되고, 모듈 로딩속도가 개선되었습니다.
268
+
258
269
  * 0.9.1 (2020-08-08)
259
270
  * 0.9.0 버전의 메모리 누수 문제가 해결되었습니다.
260
271
  * `tomotopy.CTModel.summary()`가 잘못된 결과를 출력하는 문제가 해결되었습니다.
@@ -277,7 +288,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
277
288
 
278
289
  * 0.8.2 (2020-07-14)
279
290
  * `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다.
280
- * `seed`가 동일해서 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
291
+ * `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다.
281
292
 
282
293
  * 0.8.1 (2020-06-08)
283
294
  * `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다.
@@ -302,7 +313,7 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
302
313
  * 0.7.0 (2020-04-18)
303
314
  * `tomotopy.DTModel`이 추가되었습니다.
304
315
  * `tomotopy.utils.Corpus.save`가 제대로 작동하지 않는 버그가 수정되었습니다.
305
- * `tomotopy.LDAModel.get_count_vector`가 추가되었습니다.
316
+ * `tomotopy.Document.get_count_vector`가 추가되었습니다.
306
317
  * 리눅스용 바이너리가 manylinux2010 버전으로 변경되었고 이에 따른 최적화가 진행되었습니다.
307
318
 
308
319
  * 0.6.2 (2020-03-28)
@@ -373,3 +384,21 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
373
384
 
374
385
  * 0.1.0 (2019-05-12)
375
386
  * **tomotopy**의 최초 버전
387
+
388
+ 다른 언어용 바인딩
389
+ -------------------
390
+
391
+ * Ruby: https://github.com/ankane/tomoto
392
+
393
+ 포함된 라이브러리들의 라이센스
394
+ -------------------------------
395
+ * Eigen:
396
+ This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
397
+ A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
398
+ The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
399
+
400
+ * EigenRand: `MIT License
401
+ <licenses_bundled/EigenRand>`_
402
+
403
+ * Mapbox Variant: `BSD License
404
+ <licenses_bundled/MapboxVariant>`_
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
36
36
 
37
37
  Please visit https://bab2min.github.io/tomotopy to see more information.
38
38
 
39
- The most recent version of tomotopy is 0.9.1.
39
+ The most recent version of tomotopy is 0.10.0.
40
40
 
41
41
  Getting Started
42
42
  ---------------
@@ -261,6 +261,17 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
261
261
 
262
262
  History
263
263
  -------
264
+ * 0.10.0 (2020-12-19)
265
+ * The interfaces of `tomotopy.utils.Corpus` and `tomotopy.LDAModel.docs` were unified. Now you can access the documents in a corpus in the same manner.
266
+ * __getitem__ of `tomotopy.utils.Corpus` was improved. Not only indexing by int, but also indexing by Iterable[int] and slicing are supported. Indexing by uid is supported as well.
267
+ * New methods `tomotopy.utils.Corpus.extract_ngrams` and `tomotopy.utils.Corpus.concat_ngrams` were added. They extract n-gram collocations using PMI and concatenate them into single words.
268
+ * A new method `tomotopy.LDAModel.add_corpus` was added, and `tomotopy.LDAModel.infer` can receive corpus as input.
269
+ * A new module `tomotopy.coherence` was added. It provides a way to calculate the coherence of the model.
270
+ * A parameter `window_size` was added to `tomotopy.label.FoRelevance`.
271
+ * An issue was fixed where NaN often occurs when training `tomotopy.HDPModel`.
272
+ * Now Python3.9 is supported.
273
+ * The dependency on py-cpuinfo was removed and the loading speed of the module was improved.
274
+
264
275
  * 0.9.1 (2020-08-08)
265
276
  * Memory leaks of version 0.9.0 were fixed.
266
277
  * `tomotopy.CTModel.summary()` was fixed.
@@ -380,3 +391,21 @@ History
380
391
 
381
392
  * 0.1.0 (2019-05-12)
382
393
  * First version of **tomotopy**
394
+
395
+ Bindings for Other Languages
396
+ ------------------------------
397
+
398
+ * Ruby: https://github.com/ankane/tomoto
399
+
400
+ Bundled Libraries and Their Licenses
401
+ ------------------------------------
402
+ * Eigen:
403
+ This application uses the MPL2-licensed features of Eigen, a C++ template library for linear algebra.
404
+ A copy of the MPL2 license is available at https://www.mozilla.org/en-US/MPL/2.0/.
405
+ The source code of the Eigen library can be obtained at http://eigen.tuxfamily.org/.
406
+
407
+ * EigenRand: `MIT License
408
+ <licenses_bundled/EigenRand>`_
409
+
410
+ * Mapbox Variant: `BSD License
411
+ <licenses_bundled/MapboxVariant>`_
@@ -5,161 +5,74 @@
5
5
 
6
6
  using namespace tomoto::label;
7
7
 
8
- namespace std
8
+ class DocWrapper
9
9
  {
10
- template <>
11
- struct hash<pair<tomoto::Vid, tomoto::Vid>>
10
+ const tomoto::DocumentBase* doc;
11
+ public:
12
+ DocWrapper(const tomoto::DocumentBase* _doc = nullptr)
13
+ : doc{ _doc }
12
14
  {
13
- size_t operator()(const pair<tomoto::Vid, tomoto::Vid>& k) const
14
- {
15
- return hash<tomoto::Vid>{}(k.first) ^ hash<tomoto::Vid>{}(k.second);
16
- }
17
- };
18
- }
19
-
20
- std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel * tm) const
21
- {
22
- auto& vocabFreqs = tm->getVocabCf();
23
- auto& vocabDf = tm->getVocabDf();
24
-
25
- // counting unigrams & bigrams
26
- std::unordered_map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;
15
+ }
27
16
 
28
- for (size_t i = 0; i < tm->getNumDocs(); ++i)
17
+ size_t size() const
29
18
  {
30
- std::unordered_set<std::pair<Vid, Vid>> uniqBigram;
31
- auto doc = tm->getDoc(i);
32
- Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
33
- for (size_t j = 1; j < doc->words.size(); ++j)
34
- {
35
- Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
36
- if (vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
37
- {
38
- if (vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
39
- {
40
- bigramCnt[std::make_pair(prevWord, curWord)]++;
41
- uniqBigram.emplace(prevWord, curWord);
42
- }
43
- }
44
- prevWord = curWord;
45
- }
46
-
47
- for (auto& p : uniqBigram) bigramDf[p]++;
19
+ return doc->words.size();
48
20
  }
49
21
 
50
-
51
- // counting ngrams
52
- std::vector<TrieEx<Vid, size_t>> trieNodes;
53
-
54
- if (maxLabelLen > 2)
22
+ tomoto::Vid operator[](size_t idx) const
55
23
  {
56
- std::unordered_set<std::pair<Vid, Vid>> validPair;
57
- for (auto& p : bigramCnt)
58
- {
59
- if (p.second >= candMinCnt) validPair.emplace(p.first);
60
- }
61
-
62
- trieNodes.resize(1);
63
- auto allocNode = [&]() { return trieNodes.emplace_back(), &trieNodes.back(); };
64
-
65
- for (size_t i = 0; i < tm->getNumDocs(); ++i)
66
- {
67
- auto doc = tm->getDoc(i);
68
- if (trieNodes.capacity() < trieNodes.size() + doc->words.size() * maxLabelLen)
69
- {
70
- trieNodes.reserve(std::max(trieNodes.size() + doc->words.size() * maxLabelLen, trieNodes.capacity() * 2));
71
- }
72
-
73
- Vid prevWord = doc->words[doc->wOrder.empty() ? 0 : doc->wOrder[0]];
74
- size_t labelLen = 0;
75
- auto node = &trieNodes[0];
76
- if (vocabFreqs[prevWord] >= candMinCnt)
77
- {
78
- node = trieNodes[0].makeNext(prevWord, allocNode);
79
- node->val++;
80
- labelLen = 1;
81
- }
82
-
83
- for (size_t j = 1; j < doc->words.size(); ++j)
84
- {
85
- Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
24
+ return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
25
+ }
26
+ };
86
27
 
87
- if (vocabFreqs[curWord] < candMinCnt)
88
- {
89
- node = &trieNodes[0];
90
- labelLen = 0;
91
- }
92
- else
93
- {
94
- if (labelLen >= maxLabelLen)
95
- {
96
- node = node->getFail();
97
- labelLen--;
98
- }
28
+ class DocIterator
29
+ {
30
+ const tomoto::ITopicModel* tm;
31
+ size_t idx;
32
+ public:
33
+ DocIterator(const tomoto::ITopicModel* _tm = nullptr, size_t _idx = 0)
34
+ : tm{ _tm }, idx{ _idx }
35
+ {
36
+ }
99
37
 
100
- if (validPair.count(std::make_pair(prevWord, curWord)))
101
- {
102
- auto nnode = node->makeNext(curWord, allocNode);
103
- node = nnode;
104
- do
105
- {
106
- nnode->val++;
107
- } while (nnode = nnode->getFail());
108
- labelLen++;
109
- }
110
- else
111
- {
112
- node = trieNodes[0].makeNext(curWord, allocNode);
113
- node->val++;
114
- labelLen = 1;
115
- }
116
- }
117
- prevWord = curWord;
118
- }
119
- }
38
+ DocWrapper operator*() const
39
+ {
40
+ return { tm->getDoc(idx) };
120
41
  }
121
42
 
122
- // calculating PMIs
123
- std::vector<Candidate> candidates;
124
- for (auto& p : bigramCnt)
43
+ DocIterator& operator++()
125
44
  {
126
- auto& bigram = p.first;
127
- if (p.second < candMinCnt) continue;
128
- if (bigramDf[bigram] < candMinDf) continue;
129
- auto pmi = std::log(p.second * (float)tm->getN()
130
- / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
131
- if (pmi <= 0) continue;
132
- candidates.emplace_back(pmi, bigram.first, bigram.second);
45
+ ++idx;
46
+ return *this;
133
47
  }
134
48
 
135
- if (maxLabelLen > 2)
49
+ bool operator==(const DocIterator& o) const
136
50
  {
137
- std::vector<Vid> rkeys;
138
- trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
139
- {
140
- if (rkeys.size() <= 2 || node->val < candMinCnt) return;
141
- float n = (float)tm->getN();
142
- auto pmi = node->val / n;
143
- for (auto k : rkeys)
144
- {
145
- pmi *= n / vocabFreqs[k];
146
- }
147
- pmi = std::log(pmi);
148
- candidates.emplace_back(pmi, rkeys);
149
- }, rkeys);
51
+ return tm == o.tm && idx == o.idx;
150
52
  }
151
53
 
152
- std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
54
+ bool operator!=(const DocIterator& o) const
153
55
  {
154
- return a.score > b.score;
155
- });
156
- if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
56
+ return tm != o.tm || idx != o.idx;
57
+ }
58
+ };
157
59
 
158
- for (size_t i = 0; i < vocabDf.size(); ++i)
60
+ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) const
61
+ {
62
+ auto& vocabFreqs = tm->getVocabCf();
63
+ auto& vocabDf = tm->getVocabDf();
64
+ auto candidates = extractPMINgrams(DocIterator{ tm, 0 }, DocIterator{ tm, tm->getNumDocs() },
65
+ vocabFreqs, vocabDf,
66
+ candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates, -99999.f
67
+ );
68
+ if (minLabelLen <= 1)
159
69
  {
160
- if (vocabFreqs[i] < candMinCnt) continue;
161
- if (vocabDf[i] < candMinDf) continue;
162
- candidates.emplace_back(0.f, i);
70
+ for (size_t i = 0; i < vocabDf.size(); ++i)
71
+ {
72
+ if (vocabFreqs[i] < candMinCnt) continue;
73
+ if (vocabDf[i] < candMinDf) continue;
74
+ candidates.emplace_back(0.f, i);
75
+ }
163
76
  }
164
77
  return candidates;
165
78
  }
@@ -172,8 +85,7 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
172
85
  auto node = root;
173
86
  for (size_t j = 0; j < doc->words.size(); ++j)
174
87
  {
175
- size_t t = doc->wOrder.empty() ? j : doc->wOrder[j];
176
- tomoto::Vid curWord = doc->words[t];
88
+ tomoto::Vid curWord = doc->words[doc->wOrder.empty() ? j : doc->wOrder[j]];
177
89
  if (curWord < tm->getV()) bdf[curWord] = 1;
178
90
  auto nnode = node->getNext(curWord);
179
91
  while (!nnode)
@@ -191,16 +103,15 @@ const Eigen::ArrayXi& FoRelevance::updateContext(size_t docId, const tomoto::Doc
191
103
  // the matched candidate is found
192
104
  if (nnode->val && nnode->val != (size_t)-1)
193
105
  {
194
- auto& c = candidates[nnode->val - 1];
195
106
  tomoto::OptionalLock<_lock> lock{ mtx[(nnode->val - 1) % (pool ? pool->getNumWorkers() : 1)] };
107
+ auto& c = candidates[nnode->val - 1];
196
108
  if (c.name.empty() && !doc->origWordPos.empty())
197
109
  {
198
110
  size_t start = doc->origWordPos[j + 1 - c.w.size()];
199
111
  size_t end = doc->origWordPos[j] + doc->origWordLen[j];
200
112
  c.names[doc->rawStr.substr(start, end - start)]++;
201
113
  }
202
- auto& docIds = c.docIds;
203
- if (docIds.empty() || docIds.back() != docId) docIds.emplace_back(docId);
114
+ c.docIds.emplace(docId);
204
115
  }
205
116
  } while (nnode = nnode->getFail());
206
117
  }
@@ -268,7 +179,22 @@ void FoRelevance::estimateContexts()
268
179
  wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
269
180
  }
270
181
 
271
- auto calcScores = [&](CandidateEx& c)
182
+ size_t totDocCnt = 0;
183
+ if (windowSize == (size_t)-1)
184
+ {
185
+ totDocCnt = tm->getNumDocs();
186
+ }
187
+ else
188
+ {
189
+ for (size_t i = 0; i < tm->getNumDocs(); ++i)
190
+ {
191
+ size_t s = tm->getDoc(i)->words.size();
192
+ if (s <= windowSize) totDocCnt += 1;
193
+ else totDocCnt += s - windowSize + 1;
194
+ }
195
+ }
196
+
197
+ auto calcScores = [&](CandidateEx& c, size_t windowSize)
272
198
  {
273
199
  if (c.docIds.size() < candMinDf) return;
274
200
  if (c.name.empty() && !c.names.empty())
@@ -284,20 +210,80 @@ void FoRelevance::estimateContexts()
284
210
  }
285
211
  }
286
212
 
213
+ size_t docCnt = 0;
287
214
  Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
288
215
  for (auto& docId : c.docIds)
289
216
  {
290
217
  thread_local Eigen::VectorXi bdf(this->tm->getV());
291
218
  bdf.setZero();
292
219
  auto doc = this->tm->getDoc(docId);
293
- for (size_t i = 0; i < doc->words.size(); ++i)
220
+ if (doc->words.size() <= windowSize)
294
221
  {
295
- if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
222
+ for (size_t i = 0; i < doc->words.size(); ++i)
223
+ {
224
+ if (doc->words[i] < this->tm->getV()) bdf[doc->words[i]] = 1;
225
+ }
226
+ docCnt++;
227
+ wcPMI += bdf.template cast<Float>();
228
+ }
229
+ else
230
+ {
231
+ auto wit = c.w.begin();
232
+ std::deque<size_t> wpos;
233
+ for (size_t i = 0; i < windowSize; ++i)
234
+ {
235
+ Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
236
+ if (word < this->tm->getV()) bdf[word]++;
237
+
238
+ if (word == *wit)
239
+ {
240
+ if (++wit == c.w.end())
241
+ {
242
+ wpos.emplace_back(i + 1);
243
+ wit = c.w.begin();
244
+ }
245
+ }
246
+ else if (word == c.w[0]) wit = c.w.begin() + 1;
247
+ else wit = c.w.begin();
248
+ }
249
+ if (!wpos.empty())
250
+ {
251
+ docCnt++;
252
+ wcPMI += Eigen::bool2float(bdf.array()).matrix();
253
+ }
254
+
255
+ for (size_t i = windowSize; i < doc->words.size(); ++i)
256
+ {
257
+ Vid oword = doc->words[doc->wOrder.empty() ? (i - windowSize) : doc->wOrder[i - windowSize]];
258
+ Vid word = doc->words[doc->wOrder.empty() ? i : doc->wOrder[i]];
259
+ if (oword < this->tm->getV()) bdf[oword]--;
260
+ if (word < this->tm->getV()) bdf[word]++;
261
+ if (!wpos.empty() && wpos.front() - c.w.size() <= i - windowSize)
262
+ {
263
+ wpos.pop_front();
264
+ }
265
+
266
+ if (word == *wit)
267
+ {
268
+ if (++wit == c.w.end())
269
+ {
270
+ wpos.emplace_back(i + 1);
271
+ wit = c.w.begin();
272
+ }
273
+ }
274
+ else if (word == c.w[0]) wit = c.w.begin() + 1;
275
+ else wit = c.w.begin();
276
+
277
+ if (!wpos.empty())
278
+ {
279
+ docCnt++;
280
+ wcPMI += Eigen::bool2float(bdf.array()).matrix();
281
+ }
282
+ }
296
283
  }
297
- wcPMI += bdf.cast<Float>();
298
284
  }
299
285
  c.scores = wordTopicDist.transpose() *
300
- ((wcPMI.array() + smoothing) * this->tm->getNumDocs() / c.docIds.size() / df.cast<Float>()).log().matrix();
286
+ ((wcPMI.array() + smoothing) * totDocCnt / docCnt / df.cast<Float>()).log().matrix();
301
287
  };
302
288
 
303
289
  if (pool)
@@ -311,7 +297,7 @@ void FoRelevance::estimateContexts()
311
297
  {
312
298
  for (size_t i = g; i < candidates.size(); i += groups)
313
299
  {
314
- calcScores(candidates[i]);
300
+ calcScores(candidates[i], windowSize);
315
301
  }
316
302
  }, g));
317
303
  }
@@ -321,7 +307,7 @@ void FoRelevance::estimateContexts()
321
307
  {
322
308
  for (auto& c : candidates)
323
309
  {
324
- calcScores(c);
310
+ calcScores(c, windowSize);
325
311
  }
326
312
  }
327
313