tomoto 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -3
  5. data/ext/tomoto/ext.cpp +34 -9
  6. data/ext/tomoto/extconf.rb +2 -1
  7. data/lib/tomoto/dmr.rb +1 -1
  8. data/lib/tomoto/gdmr.rb +1 -1
  9. data/lib/tomoto/version.rb +1 -1
  10. data/vendor/tomotopy/LICENSE +1 -1
  11. data/vendor/tomotopy/README.kr.rst +32 -3
  12. data/vendor/tomotopy/README.rst +30 -1
  13. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
  14. data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
  15. data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
  16. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
  17. data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
  18. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
  20. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
  21. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
  22. data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
  23. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
  24. data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
  25. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
  26. data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
  27. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
  28. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
  29. data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
  30. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
  31. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
  32. data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
  33. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
  34. data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
  35. data/vendor/tomotopy/src/Utils/math.h +1 -1
  36. data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
  37. data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
  38. data/vendor/variant/LICENSE +25 -0
  39. data/vendor/variant/LICENSE_1_0.txt +23 -0
  40. data/vendor/variant/README.md +102 -0
  41. data/vendor/variant/include/mapbox/optional.hpp +74 -0
  42. data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
  43. data/vendor/variant/include/mapbox/variant.hpp +974 -0
  44. data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
  45. metadata +15 -7
@@ -16,12 +16,163 @@ namespace tomoto
16
16
  {
17
17
  namespace label
18
18
  {
19
+ template<typename _DocIter, typename _Freqs>
20
+ std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
21
+ _Freqs&& vocabFreqs, _Freqs&& vocabDf,
22
+ size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
23
+ {
24
+ struct vvhash
25
+ {
26
+ size_t operator()(const std::pair<Vid, Vid>& k) const
27
+ {
28
+ return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
29
+ }
30
+ };
31
+
32
+ // counting unigrams & bigrams
33
+ std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
34
+
35
+ for(auto docIt = docBegin; docIt != docEnd; ++docIt)
36
+ {
37
+ std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
38
+ auto doc = *docIt;
39
+ Vid prevWord = doc[0];
40
+ for (size_t j = 1; j < doc.size(); ++j)
41
+ {
42
+ Vid curWord = doc[j];
43
+ if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
44
+ {
45
+ if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
46
+ {
47
+ bigramCnt[std::make_pair(prevWord, curWord)]++;
48
+ uniqBigram.emplace(prevWord, curWord);
49
+ }
50
+ }
51
+ prevWord = curWord;
52
+ }
53
+
54
+ for (auto& p : uniqBigram) bigramDf[p]++;
55
+ }
56
+
57
+
58
+ // counting ngrams
59
+ std::vector<TrieEx<Vid, size_t>> trieNodes;
60
+
61
+ if (maxNgrams > 2)
62
+ {
63
+ std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
64
+ for (auto& p : bigramCnt)
65
+ {
66
+ if (p.second >= candMinCnt) validPair.emplace(p.first);
67
+ }
68
+
69
+ trieNodes.resize(1);
70
+ auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
71
+
72
+ for (auto docIt = docBegin; docIt != docEnd; ++docIt)
73
+ {
74
+ auto doc = *docIt;
75
+ if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
76
+ {
77
+ trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
78
+ }
79
+
80
+ Vid prevWord = doc[0];
81
+ size_t labelLen = 0;
82
+ auto node = &trieNodes[0];
83
+ if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
84
+ {
85
+ node = trieNodes[0].makeNext(prevWord, allocNode);
86
+ node->val++;
87
+ labelLen = 1;
88
+ }
89
+
90
+ for (size_t j = 1; j < doc.size(); ++j)
91
+ {
92
+ Vid curWord = doc[j];
93
+
94
+ if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
95
+ {
96
+ node = &trieNodes[0];
97
+ labelLen = 0;
98
+ }
99
+ else
100
+ {
101
+ if (labelLen >= maxNgrams)
102
+ {
103
+ node = node->getFail();
104
+ labelLen--;
105
+ }
106
+
107
+ if (validPair.count(std::make_pair(prevWord, curWord)))
108
+ {
109
+ auto nnode = node->makeNext(curWord, allocNode);
110
+ node = nnode;
111
+ do
112
+ {
113
+ nnode->val++;
114
+ } while (nnode = nnode->getFail());
115
+ labelLen++;
116
+ }
117
+ else
118
+ {
119
+ node = trieNodes[0].makeNext(curWord, allocNode);
120
+ node->val++;
121
+ labelLen = 1;
122
+ }
123
+ }
124
+ prevWord = curWord;
125
+ }
126
+ }
127
+ }
128
+
129
+ float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
130
+
131
+ // calculating PMIs
132
+ std::vector<Candidate> candidates;
133
+ for (auto& p : bigramCnt)
134
+ {
135
+ auto& bigram = p.first;
136
+ if (p.second < candMinCnt) continue;
137
+ if (bigramDf[bigram] < candMinDf) continue;
138
+ auto pmi = std::log(p.second * totN
139
+ / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
140
+ if (pmi <= 0) continue;
141
+ candidates.emplace_back(pmi, bigram.first, bigram.second);
142
+ }
143
+
144
+ if (maxNgrams > 2)
145
+ {
146
+ std::vector<Vid> rkeys;
147
+ trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
148
+ {
149
+ if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
150
+ auto pmi = node->val / totN;
151
+ for (auto k : rkeys)
152
+ {
153
+ pmi *= totN / vocabFreqs[k];
154
+ }
155
+ pmi = std::log(pmi);
156
+ if (pmi < minScore) return;
157
+ candidates.emplace_back(pmi, rkeys);
158
+ }, rkeys);
159
+ }
160
+
161
+ std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
162
+ {
163
+ return a.score > b.score;
164
+ });
165
+ if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
166
+ return candidates;
167
+ }
168
+
169
+
19
170
  class PMIExtractor : public IExtractor
20
171
  {
21
- size_t candMinCnt, candMinDf, maxLabelLen, maxCandidates;
172
+ size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
22
173
  public:
23
- PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
24
- : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
174
+ PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
175
+ : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
25
176
  {
26
177
  }
27
178
 
@@ -33,7 +184,7 @@ namespace tomoto
33
184
  struct CandidateEx : public Candidate
34
185
  {
35
186
  std::unordered_map<std::string, size_t> names;
36
- std::vector<size_t> docIds;
187
+ std::set<size_t> docIds;
37
188
  Eigen::Array<Float, -1, 1> scores;
38
189
 
39
190
  CandidateEx()
@@ -49,6 +200,7 @@ namespace tomoto
49
200
  const ITopicModel* tm;
50
201
  size_t candMinDf;
51
202
  float smoothing, lambda, mu;
203
+ size_t windowSize;
52
204
  std::unique_ptr<ThreadPool> pool;
53
205
  std::unique_ptr<std::mutex[]> mtx;
54
206
  std::vector<CandidateEx> candidates;
@@ -63,9 +215,10 @@ namespace tomoto
63
215
  FoRelevance(const ITopicModel* _tm,
64
216
  _Iter candFirst, _Iter candEnd,
65
217
  size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
218
+ size_t _windowSize = (size_t)-1,
66
219
  size_t numWorkers = 0)
67
220
  : tm{ _tm }, candMinDf{ _candMinDf },
68
- smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }
221
+ smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }, windowSize{ _windowSize }
69
222
  {
70
223
  if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
71
224
  if (numWorkers > 1)
@@ -8,7 +8,7 @@ namespace tomoto
8
8
  {
9
9
  using BaseDocument = DocumentLDA<_tw>;
10
10
  using DocumentLDA<_tw>::DocumentLDA;
11
- size_t metadata = 0;
11
+ uint64_t metadata = 0;
12
12
 
13
13
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
14
14
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata);
@@ -23,21 +23,6 @@ namespace tomoto
23
23
  size_t seed = std::random_device{}(),
24
24
  bool scalarRng = false);
25
25
 
26
- virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) = 0;
27
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const = 0;
28
-
29
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
30
- const std::vector<std::string>& metadata) = 0;
31
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
32
- const std::vector<std::string>& metadata) const = 0;
33
-
34
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
35
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
36
- const std::vector<std::string>& metadata) = 0;
37
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
38
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
39
- const std::vector<std::string>& metadata) const = 0;
40
-
41
26
  virtual void setAlphaEps(Float _alphaEps) = 0;
42
27
  virtual Float getAlphaEps() const = 0;
43
28
  virtual void setOptimRepeat(size_t repeat) = 0;
@@ -262,63 +262,44 @@ namespace tomoto
262
262
  }
263
263
 
264
264
  template<bool _const = false>
265
- _DocType& _updateDoc(_DocType& doc, const std::vector<std::string>& metadata)
265
+ _DocType& _updateDoc(_DocType& doc, const std::string& metadata)
266
266
  {
267
- std::string metadataJoined = text::join(metadata.begin(), metadata.end(), "_");
268
267
  Vid xid;
269
268
  if (_const)
270
269
  {
271
- xid = metadataDict.toWid(metadataJoined);
270
+ xid = metadataDict.toWid(metadata);
272
271
  if (xid == (Vid)-1) throw std::invalid_argument("unknown metadata");
273
272
  }
274
273
  else
275
274
  {
276
- xid = metadataDict.add(metadataJoined);
275
+ xid = metadataDict.add(metadata);
277
276
  }
278
277
  doc.metadata = xid;
279
278
  return doc;
280
279
  }
281
280
 
282
- size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) override
281
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
283
282
  {
284
- auto doc = this->_makeDoc(words);
285
- return this->_addDoc(_updateDoc(doc, metadata));
283
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
284
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
286
285
  }
287
286
 
288
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const override
287
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
289
288
  {
290
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
291
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
289
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
290
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
292
291
  }
293
292
 
294
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
295
- const std::vector<std::string>& metadata) override
293
+ size_t addDoc(const RawDoc& rawDoc) override
296
294
  {
297
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
298
- return this->_addDoc(_updateDoc(doc, metadata));
295
+ auto doc = this->_makeFromRawDoc(rawDoc);
296
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
299
297
  }
300
298
 
301
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
302
- const std::vector<std::string>& metadata) const override
299
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
303
300
  {
304
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
305
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
306
- }
307
-
308
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
309
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
310
- const std::vector<std::string>& metadata) override
311
- {
312
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
313
- return this->_addDoc(_updateDoc(doc, metadata));
314
- }
315
-
316
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
317
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
318
- const std::vector<std::string>& metadata) const override
319
- {
320
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
321
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
301
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
302
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
322
303
  }
323
304
 
324
305
  GETTER(F, size_t, F);
@@ -10,7 +10,7 @@ namespace tomoto
10
10
  using BaseDocument = DocumentLDA<_tw>;
11
11
  using DocumentLDA<_tw>::DocumentLDA;
12
12
 
13
- size_t timepoint = 0;
13
+ uint64_t timepoint = 0;
14
14
  ShareableVector<Float> eta;
15
15
  sample::AliasMethod<> aliasTable;
16
16
 
@@ -29,21 +29,6 @@ namespace tomoto
29
29
  size_t seed = std::random_device{}(),
30
30
  bool scalarRng = false);
31
31
 
32
- virtual size_t addDoc(const std::vector<std::string>& words, size_t timepoint) = 0;
33
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, size_t timepoint) const = 0;
34
-
35
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
36
- size_t timepoint) = 0;
37
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
38
- size_t timepoint) const = 0;
39
-
40
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
41
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
42
- size_t timepoint) = 0;
43
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
44
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
45
- size_t timepoint) const = 0;
46
-
47
32
  virtual size_t getT() const = 0;
48
33
  virtual std::vector<uint32_t> getNumDocsByT() const = 0;
49
34
 
@@ -468,7 +468,7 @@ namespace tomoto
468
468
  return ret;
469
469
  }
470
470
 
471
- _DocType& _updateDoc(_DocType& doc, size_t timepoint) const
471
+ _DocType& _updateDoc(_DocType& doc, uint32_t timepoint) const
472
472
  {
473
473
  if (timepoint >= T) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "timepoint must < T");
474
474
  doc.timepoint = timepoint;
@@ -512,51 +512,34 @@ namespace tomoto
512
512
  {
513
513
  }
514
514
 
515
- size_t addDoc(const std::vector<std::string>& words, size_t timepoint) override
515
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
516
516
  {
517
- auto doc = this->_makeDoc(words);
518
- return this->_addDoc(_updateDoc(doc, timepoint));
517
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
518
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
519
519
  }
520
520
 
521
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, size_t timepoint) const override
521
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
522
522
  {
523
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
524
- return make_unique<_DocType>(_updateDoc(doc, timepoint));
523
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
524
+ return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
525
525
  }
526
526
 
527
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
528
- size_t timepoint) override
527
+ size_t addDoc(const RawDoc& rawDoc) override
529
528
  {
530
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
531
- return this->_addDoc(_updateDoc(doc, timepoint));
529
+ auto doc = this->_makeFromRawDoc(rawDoc);
530
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
532
531
  }
533
532
 
534
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
535
- size_t timepoint) const override
533
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
536
534
  {
537
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
538
- return make_unique<_DocType>(_updateDoc(doc, timepoint));
539
- }
540
-
541
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
542
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
543
- size_t timepoint) override
544
- {
545
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
546
- return this->_addDoc(_updateDoc(doc, timepoint));
547
- }
548
-
549
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
550
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
551
- size_t timepoint) const override
552
- {
553
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
554
- return make_unique<_DocType>(_updateDoc(doc, timepoint));
535
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
536
+ return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
555
537
  }
556
538
 
557
539
  Float getAlpha(size_t k, size_t t) const override
558
540
  {
559
- return alphas(k, t);
541
+ if (alphas.size()) return alphas(k, t);
542
+ return 0;
560
543
  }
561
544
 
562
545
  std::vector<Float> getPhi(size_t k, size_t t) const override
@@ -353,58 +353,36 @@ namespace tomoto
353
353
  }
354
354
 
355
355
  template<bool _const = false>
356
- _DocType& _updateDoc(_DocType& doc, const std::vector<std::string>& metadata) const
356
+ _DocType& _updateDoc(_DocType& doc, const std::vector<Float>& metadata) const
357
357
  {
358
358
  if (metadata.size() != degreeByF.size())
359
359
  throw std::invalid_argument{ "a length of `metadata` should be equal to a length of `degrees`" };
360
-
361
- std::transform(metadata.begin(), metadata.end(), back_inserter(doc.metadataOrg), [](const std::string& w)
362
- {
363
- return std::stof(w);
364
- });
360
+ doc.metadataOrg = metadata;
365
361
  return doc;
366
362
  }
367
363
 
368
- size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) override
369
- {
370
- auto doc = this->_makeDoc(words);
371
- return this->_addDoc(_updateDoc(doc, metadata));
372
- }
373
-
374
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const override
364
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
375
365
  {
376
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
377
- return make_unique<_DocType>(_updateDoc(doc, metadata));
366
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
367
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
378
368
  }
379
369
 
380
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
381
- const std::vector<std::string>& metadata) override
370
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
382
371
  {
383
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
384
- return this->_addDoc(_updateDoc(doc, metadata));
372
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
373
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
385
374
  }
386
375
 
387
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
388
- const std::vector<std::string>& metadata) const override
376
+ size_t addDoc(const RawDoc& rawDoc) override
389
377
  {
390
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
391
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
378
+ auto doc = this->_makeFromRawDoc(rawDoc);
379
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
392
380
  }
393
381
 
394
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
395
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
396
- const std::vector<std::string>& metadata) override
382
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
397
383
  {
398
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
399
- return this->_addDoc(_updateDoc(doc, metadata));
400
- }
401
-
402
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
403
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
404
- const std::vector<std::string>& metadata) const override
405
- {
406
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
407
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
384
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
385
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
408
386
  }
409
387
 
410
388
  std::vector<Float> getTopicsByDoc(const _DocType& doc) const
@@ -428,7 +406,10 @@ namespace tomoto
428
406
  std::vector<Float> getLambdaByTopic(Tid tid) const override
429
407
  {
430
408
  std::vector<Float> ret(this->F);
431
- for (size_t f = 0; f < this->F; ++f) ret[f] = this->lambda.row(tid)[f];
409
+ if (this->lambda.size())
410
+ {
411
+ Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ ret.data(), (Eigen::Index)ret.size() } = this->lambda.row(tid);
412
+ }
432
413
  return ret;
433
414
  }
434
415