tomoto 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -3
  5. data/ext/tomoto/ext.cpp +34 -9
  6. data/ext/tomoto/extconf.rb +2 -1
  7. data/lib/tomoto/dmr.rb +1 -1
  8. data/lib/tomoto/gdmr.rb +1 -1
  9. data/lib/tomoto/version.rb +1 -1
  10. data/vendor/tomotopy/LICENSE +1 -1
  11. data/vendor/tomotopy/README.kr.rst +32 -3
  12. data/vendor/tomotopy/README.rst +30 -1
  13. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
  14. data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
  15. data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
  16. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
  17. data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
  18. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
  20. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
  21. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
  22. data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
  23. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
  24. data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
  25. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
  26. data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
  27. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
  28. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
  29. data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
  30. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
  31. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
  32. data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
  33. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
  34. data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
  35. data/vendor/tomotopy/src/Utils/math.h +1 -1
  36. data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
  37. data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
  38. data/vendor/variant/LICENSE +25 -0
  39. data/vendor/variant/LICENSE_1_0.txt +23 -0
  40. data/vendor/variant/README.md +102 -0
  41. data/vendor/variant/include/mapbox/optional.hpp +74 -0
  42. data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
  43. data/vendor/variant/include/mapbox/variant.hpp +974 -0
  44. data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
  45. metadata +15 -7
@@ -16,12 +16,163 @@ namespace tomoto
16
16
  {
17
17
  namespace label
18
18
  {
19
+ template<typename _DocIter, typename _Freqs>
20
+ std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
21
+ _Freqs&& vocabFreqs, _Freqs&& vocabDf,
22
+ size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
23
+ {
24
+ struct vvhash
25
+ {
26
+ size_t operator()(const std::pair<Vid, Vid>& k) const
27
+ {
28
+ return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
29
+ }
30
+ };
31
+
32
+ // counting unigrams & bigrams
33
+ std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
34
+
35
+ for(auto docIt = docBegin; docIt != docEnd; ++docIt)
36
+ {
37
+ std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
38
+ auto doc = *docIt;
39
+ Vid prevWord = doc[0];
40
+ for (size_t j = 1; j < doc.size(); ++j)
41
+ {
42
+ Vid curWord = doc[j];
43
+ if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
44
+ {
45
+ if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
46
+ {
47
+ bigramCnt[std::make_pair(prevWord, curWord)]++;
48
+ uniqBigram.emplace(prevWord, curWord);
49
+ }
50
+ }
51
+ prevWord = curWord;
52
+ }
53
+
54
+ for (auto& p : uniqBigram) bigramDf[p]++;
55
+ }
56
+
57
+
58
+ // counting ngrams
59
+ std::vector<TrieEx<Vid, size_t>> trieNodes;
60
+
61
+ if (maxNgrams > 2)
62
+ {
63
+ std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
64
+ for (auto& p : bigramCnt)
65
+ {
66
+ if (p.second >= candMinCnt) validPair.emplace(p.first);
67
+ }
68
+
69
+ trieNodes.resize(1);
70
+ auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
71
+
72
+ for (auto docIt = docBegin; docIt != docEnd; ++docIt)
73
+ {
74
+ auto doc = *docIt;
75
+ if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
76
+ {
77
+ trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
78
+ }
79
+
80
+ Vid prevWord = doc[0];
81
+ size_t labelLen = 0;
82
+ auto node = &trieNodes[0];
83
+ if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
84
+ {
85
+ node = trieNodes[0].makeNext(prevWord, allocNode);
86
+ node->val++;
87
+ labelLen = 1;
88
+ }
89
+
90
+ for (size_t j = 1; j < doc.size(); ++j)
91
+ {
92
+ Vid curWord = doc[j];
93
+
94
+ if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
95
+ {
96
+ node = &trieNodes[0];
97
+ labelLen = 0;
98
+ }
99
+ else
100
+ {
101
+ if (labelLen >= maxNgrams)
102
+ {
103
+ node = node->getFail();
104
+ labelLen--;
105
+ }
106
+
107
+ if (validPair.count(std::make_pair(prevWord, curWord)))
108
+ {
109
+ auto nnode = node->makeNext(curWord, allocNode);
110
+ node = nnode;
111
+ do
112
+ {
113
+ nnode->val++;
114
+ } while (nnode = nnode->getFail());
115
+ labelLen++;
116
+ }
117
+ else
118
+ {
119
+ node = trieNodes[0].makeNext(curWord, allocNode);
120
+ node->val++;
121
+ labelLen = 1;
122
+ }
123
+ }
124
+ prevWord = curWord;
125
+ }
126
+ }
127
+ }
128
+
129
+ float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
130
+
131
+ // calculating PMIs
132
+ std::vector<Candidate> candidates;
133
+ for (auto& p : bigramCnt)
134
+ {
135
+ auto& bigram = p.first;
136
+ if (p.second < candMinCnt) continue;
137
+ if (bigramDf[bigram] < candMinDf) continue;
138
+ auto pmi = std::log(p.second * totN
139
+ / vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
140
+ if (pmi <= 0) continue;
141
+ candidates.emplace_back(pmi, bigram.first, bigram.second);
142
+ }
143
+
144
+ if (maxNgrams > 2)
145
+ {
146
+ std::vector<Vid> rkeys;
147
+ trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
148
+ {
149
+ if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
150
+ auto pmi = node->val / totN;
151
+ for (auto k : rkeys)
152
+ {
153
+ pmi *= totN / vocabFreqs[k];
154
+ }
155
+ pmi = std::log(pmi);
156
+ if (pmi < minScore) return;
157
+ candidates.emplace_back(pmi, rkeys);
158
+ }, rkeys);
159
+ }
160
+
161
+ std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
162
+ {
163
+ return a.score > b.score;
164
+ });
165
+ if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
166
+ return candidates;
167
+ }
168
+
169
+
19
170
  class PMIExtractor : public IExtractor
20
171
  {
21
- size_t candMinCnt, candMinDf, maxLabelLen, maxCandidates;
172
+ size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
22
173
  public:
23
- PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
24
- : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
174
+ PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
175
+ : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
25
176
  {
26
177
  }
27
178
 
@@ -33,7 +184,7 @@ namespace tomoto
33
184
  struct CandidateEx : public Candidate
34
185
  {
35
186
  std::unordered_map<std::string, size_t> names;
36
- std::vector<size_t> docIds;
187
+ std::set<size_t> docIds;
37
188
  Eigen::Array<Float, -1, 1> scores;
38
189
 
39
190
  CandidateEx()
@@ -49,6 +200,7 @@ namespace tomoto
49
200
  const ITopicModel* tm;
50
201
  size_t candMinDf;
51
202
  float smoothing, lambda, mu;
203
+ size_t windowSize;
52
204
  std::unique_ptr<ThreadPool> pool;
53
205
  std::unique_ptr<std::mutex[]> mtx;
54
206
  std::vector<CandidateEx> candidates;
@@ -63,9 +215,10 @@ namespace tomoto
63
215
  FoRelevance(const ITopicModel* _tm,
64
216
  _Iter candFirst, _Iter candEnd,
65
217
  size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
218
+ size_t _windowSize = (size_t)-1,
66
219
  size_t numWorkers = 0)
67
220
  : tm{ _tm }, candMinDf{ _candMinDf },
68
- smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }
221
+ smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }, windowSize{ _windowSize }
69
222
  {
70
223
  if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
71
224
  if (numWorkers > 1)
@@ -8,7 +8,7 @@ namespace tomoto
8
8
  {
9
9
  using BaseDocument = DocumentLDA<_tw>;
10
10
  using DocumentLDA<_tw>::DocumentLDA;
11
- size_t metadata = 0;
11
+ uint64_t metadata = 0;
12
12
 
13
13
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
14
14
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata);
@@ -23,21 +23,6 @@ namespace tomoto
23
23
  size_t seed = std::random_device{}(),
24
24
  bool scalarRng = false);
25
25
 
26
- virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) = 0;
27
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const = 0;
28
-
29
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
30
- const std::vector<std::string>& metadata) = 0;
31
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
32
- const std::vector<std::string>& metadata) const = 0;
33
-
34
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
35
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
36
- const std::vector<std::string>& metadata) = 0;
37
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
38
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
39
- const std::vector<std::string>& metadata) const = 0;
40
-
41
26
  virtual void setAlphaEps(Float _alphaEps) = 0;
42
27
  virtual Float getAlphaEps() const = 0;
43
28
  virtual void setOptimRepeat(size_t repeat) = 0;
@@ -262,63 +262,44 @@ namespace tomoto
262
262
  }
263
263
 
264
264
  template<bool _const = false>
265
- _DocType& _updateDoc(_DocType& doc, const std::vector<std::string>& metadata)
265
+ _DocType& _updateDoc(_DocType& doc, const std::string& metadata)
266
266
  {
267
- std::string metadataJoined = text::join(metadata.begin(), metadata.end(), "_");
268
267
  Vid xid;
269
268
  if (_const)
270
269
  {
271
- xid = metadataDict.toWid(metadataJoined);
270
+ xid = metadataDict.toWid(metadata);
272
271
  if (xid == (Vid)-1) throw std::invalid_argument("unknown metadata");
273
272
  }
274
273
  else
275
274
  {
276
- xid = metadataDict.add(metadataJoined);
275
+ xid = metadataDict.add(metadata);
277
276
  }
278
277
  doc.metadata = xid;
279
278
  return doc;
280
279
  }
281
280
 
282
- size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) override
281
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
283
282
  {
284
- auto doc = this->_makeDoc(words);
285
- return this->_addDoc(_updateDoc(doc, metadata));
283
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
284
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
286
285
  }
287
286
 
288
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const override
287
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
289
288
  {
290
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
291
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
289
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
290
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
292
291
  }
293
292
 
294
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
295
- const std::vector<std::string>& metadata) override
293
+ size_t addDoc(const RawDoc& rawDoc) override
296
294
  {
297
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
298
- return this->_addDoc(_updateDoc(doc, metadata));
295
+ auto doc = this->_makeFromRawDoc(rawDoc);
296
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
299
297
  }
300
298
 
301
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
302
- const std::vector<std::string>& metadata) const override
299
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
303
300
  {
304
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
305
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
306
- }
307
-
308
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
309
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
310
- const std::vector<std::string>& metadata) override
311
- {
312
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
313
- return this->_addDoc(_updateDoc(doc, metadata));
314
- }
315
-
316
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
317
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
318
- const std::vector<std::string>& metadata) const override
319
- {
320
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
321
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
301
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
302
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
322
303
  }
323
304
 
324
305
  GETTER(F, size_t, F);
@@ -10,7 +10,7 @@ namespace tomoto
10
10
  using BaseDocument = DocumentLDA<_tw>;
11
11
  using DocumentLDA<_tw>::DocumentLDA;
12
12
 
13
- size_t timepoint = 0;
13
+ uint64_t timepoint = 0;
14
14
  ShareableVector<Float> eta;
15
15
  sample::AliasMethod<> aliasTable;
16
16
 
@@ -29,21 +29,6 @@ namespace tomoto
29
29
  size_t seed = std::random_device{}(),
30
30
  bool scalarRng = false);
31
31
 
32
- virtual size_t addDoc(const std::vector<std::string>& words, size_t timepoint) = 0;
33
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, size_t timepoint) const = 0;
34
-
35
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
36
- size_t timepoint) = 0;
37
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
38
- size_t timepoint) const = 0;
39
-
40
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
41
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
42
- size_t timepoint) = 0;
43
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
44
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
45
- size_t timepoint) const = 0;
46
-
47
32
  virtual size_t getT() const = 0;
48
33
  virtual std::vector<uint32_t> getNumDocsByT() const = 0;
49
34
 
@@ -468,7 +468,7 @@ namespace tomoto
468
468
  return ret;
469
469
  }
470
470
 
471
- _DocType& _updateDoc(_DocType& doc, size_t timepoint) const
471
+ _DocType& _updateDoc(_DocType& doc, uint32_t timepoint) const
472
472
  {
473
473
  if (timepoint >= T) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "timepoint must < T");
474
474
  doc.timepoint = timepoint;
@@ -512,51 +512,34 @@ namespace tomoto
512
512
  {
513
513
  }
514
514
 
515
- size_t addDoc(const std::vector<std::string>& words, size_t timepoint) override
515
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
516
516
  {
517
- auto doc = this->_makeDoc(words);
518
- return this->_addDoc(_updateDoc(doc, timepoint));
517
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
518
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
519
519
  }
520
520
 
521
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, size_t timepoint) const override
521
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
522
522
  {
523
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
524
- return make_unique<_DocType>(_updateDoc(doc, timepoint));
523
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
524
+ return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
525
525
  }
526
526
 
527
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
528
- size_t timepoint) override
527
+ size_t addDoc(const RawDoc& rawDoc) override
529
528
  {
530
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
531
- return this->_addDoc(_updateDoc(doc, timepoint));
529
+ auto doc = this->_makeFromRawDoc(rawDoc);
530
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
532
531
  }
533
532
 
534
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
535
- size_t timepoint) const override
533
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
536
534
  {
537
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
538
- return make_unique<_DocType>(_updateDoc(doc, timepoint));
539
- }
540
-
541
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
542
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
543
- size_t timepoint) override
544
- {
545
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
546
- return this->_addDoc(_updateDoc(doc, timepoint));
547
- }
548
-
549
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
550
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
551
- size_t timepoint) const override
552
- {
553
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
554
- return make_unique<_DocType>(_updateDoc(doc, timepoint));
535
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
536
+ return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
555
537
  }
556
538
 
557
539
  Float getAlpha(size_t k, size_t t) const override
558
540
  {
559
- return alphas(k, t);
541
+ if (alphas.size()) return alphas(k, t);
542
+ return 0;
560
543
  }
561
544
 
562
545
  std::vector<Float> getPhi(size_t k, size_t t) const override
@@ -353,58 +353,36 @@ namespace tomoto
353
353
  }
354
354
 
355
355
  template<bool _const = false>
356
- _DocType& _updateDoc(_DocType& doc, const std::vector<std::string>& metadata) const
356
+ _DocType& _updateDoc(_DocType& doc, const std::vector<Float>& metadata) const
357
357
  {
358
358
  if (metadata.size() != degreeByF.size())
359
359
  throw std::invalid_argument{ "a length of `metadata` should be equal to a length of `degrees`" };
360
-
361
- std::transform(metadata.begin(), metadata.end(), back_inserter(doc.metadataOrg), [](const std::string& w)
362
- {
363
- return std::stof(w);
364
- });
360
+ doc.metadataOrg = metadata;
365
361
  return doc;
366
362
  }
367
363
 
368
- size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) override
369
- {
370
- auto doc = this->_makeDoc(words);
371
- return this->_addDoc(_updateDoc(doc, metadata));
372
- }
373
-
374
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const override
364
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
375
365
  {
376
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
377
- return make_unique<_DocType>(_updateDoc(doc, metadata));
366
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
367
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
378
368
  }
379
369
 
380
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
381
- const std::vector<std::string>& metadata) override
370
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
382
371
  {
383
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
384
- return this->_addDoc(_updateDoc(doc, metadata));
372
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
373
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
385
374
  }
386
375
 
387
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
388
- const std::vector<std::string>& metadata) const override
376
+ size_t addDoc(const RawDoc& rawDoc) override
389
377
  {
390
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
391
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
378
+ auto doc = this->_makeFromRawDoc(rawDoc);
379
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
392
380
  }
393
381
 
394
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
395
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
396
- const std::vector<std::string>& metadata) override
382
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
397
383
  {
398
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
399
- return this->_addDoc(_updateDoc(doc, metadata));
400
- }
401
-
402
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
403
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
404
- const std::vector<std::string>& metadata) const override
405
- {
406
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
407
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
384
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
385
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
408
386
  }
409
387
 
410
388
  std::vector<Float> getTopicsByDoc(const _DocType& doc) const
@@ -428,7 +406,10 @@ namespace tomoto
428
406
  std::vector<Float> getLambdaByTopic(Tid tid) const override
429
407
  {
430
408
  std::vector<Float> ret(this->F);
431
- for (size_t f = 0; f < this->F; ++f) ret[f] = this->lambda.row(tid)[f];
409
+ if (this->lambda.size())
410
+ {
411
+ Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ ret.data(), (Eigen::Index)ret.size() } = this->lambda.row(tid);
412
+ }
432
413
  return ret;
433
414
  }
434
415