tomoto 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -3
  5. data/ext/tomoto/ext.cpp +34 -9
  6. data/ext/tomoto/extconf.rb +2 -1
  7. data/lib/tomoto/dmr.rb +1 -1
  8. data/lib/tomoto/gdmr.rb +1 -1
  9. data/lib/tomoto/version.rb +1 -1
  10. data/vendor/tomotopy/LICENSE +1 -1
  11. data/vendor/tomotopy/README.kr.rst +32 -3
  12. data/vendor/tomotopy/README.rst +30 -1
  13. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
  14. data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
  15. data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
  16. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
  17. data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
  18. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
  20. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
  21. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
  22. data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
  23. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
  24. data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
  25. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
  26. data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
  27. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
  28. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
  29. data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
  30. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
  31. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
  32. data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
  33. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
  34. data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
  35. data/vendor/tomotopy/src/Utils/math.h +1 -1
  36. data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
  37. data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
  38. data/vendor/variant/LICENSE +25 -0
  39. data/vendor/variant/LICENSE_1_0.txt +23 -0
  40. data/vendor/variant/README.md +102 -0
  41. data/vendor/variant/include/mapbox/optional.hpp +74 -0
  42. data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
  43. data/vendor/variant/include/mapbox/variant.hpp +974 -0
  44. data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
  45. metadata +15 -7
@@ -84,12 +84,12 @@ namespace tomoto
84
84
  {
85
85
  const size_t V = this->realV;
86
86
  size_t pos;
87
- for (pos = 0; pos < ld.numTableByTopic.size(); ++pos)
87
+ for (pos = 0; pos < (size_t)ld.numTableByTopic.size(); ++pos)
88
88
  {
89
89
  if (!ld.numTableByTopic[pos]) break;
90
90
  }
91
91
 
92
- if (pos >= ld.numByTopic.size())
92
+ if (pos >= (size_t)ld.numByTopic.size())
93
93
  {
94
94
  size_t oldSize = ld.numByTopic.size(), newSize = pos + 1;
95
95
  ld.numTableByTopic.conservativeResize(newSize);
@@ -302,8 +302,7 @@ namespace tomoto
302
302
  size_t oldSize = globalState.numByTopic.size();
303
303
  globalState.numByTopic.conservativeResize(K);
304
304
  globalState.numByTopic.tail(K - oldSize).setZero();
305
- globalState.numTableByTopic.conservativeResize(K);
306
- globalState.numTableByTopic.tail(K - oldSize).setZero();
305
+ globalState.numTableByTopic.resize(K);
307
306
  globalState.numByTopicWord.conservativeResize(K, Eigen::NoChange);
308
307
  globalState.numByTopicWord.block(oldSize, 0, K - oldSize, V).setZero();
309
308
  }
@@ -314,8 +313,6 @@ namespace tomoto
314
313
  size_t locK = localData[i].numByTopic.size();
315
314
  globalState.numByTopic.head(locK)
316
315
  += localData[i].numByTopic.head(locK) - tState.numByTopic.head(locK);
317
- globalState.numTableByTopic.head(locK)
318
- += localData[i].numTableByTopic.head(locK) - tState.numTableByTopic.head(locK);
319
316
  globalState.numByTopicWord.block(0, 0, locK, V)
320
317
  += localData[i].numByTopicWord.block(0, 0, locK, V) - tState.numByTopicWord.block(0, 0, locK, V);
321
318
  }
@@ -327,10 +324,16 @@ namespace tomoto
327
324
  globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0);
328
325
  }
329
326
 
330
- globalState.totalTable = accumulate(this->docs.begin(), this->docs.end(), 0, [](size_t sum, const _DocType& doc)
327
+
328
+ globalState.numTableByTopic.setZero();
329
+ for (auto& doc : this->docs)
331
330
  {
332
- return sum + doc.getNumTable();
333
- });
331
+ for (auto& table : doc.numTopicByTable)
332
+ {
333
+ if (table) globalState.numTableByTopic[table.topic]++;
334
+ }
335
+ }
336
+ globalState.totalTable = globalState.numTableByTopic.sum();
334
337
 
335
338
  for (size_t i = 0; i < pool.getNumWorkers(); ++i)
336
339
  {
@@ -368,19 +371,13 @@ namespace tomoto
368
371
  const auto eta = this->eta;
369
372
  double ll = 0;
370
373
  // table partition ll
371
- size_t liveK = 0;
372
- for (Tid k = 0; k < K; ++k)
373
- {
374
- if (!isLiveTopic(k)) continue;
375
- ll += math::lgammaT(ld.numTableByTopic[k]);
376
- ++liveK;
377
- }
378
-
374
+ size_t liveK = (ld.numTableByTopic.array() > 0).template cast<size_t>().sum();
375
+ Eigen::ArrayXf lg = math::lgammaApprox(ld.numTableByTopic.array().template cast<Float>());
376
+ ll += (ld.numTableByTopic.array() > 0).select(lg, 0).sum();
379
377
  ll += liveK * log(gamma) - math::lgammaT(ld.totalTable + gamma) + math::lgammaT(gamma);
380
378
 
381
379
  // topic word ll
382
380
  ll += liveK * math::lgammaT(V * eta);
383
-
384
381
  for (Tid k = 0; k < K; ++k)
385
382
  {
386
383
  if (!isLiveTopic(k)) continue;
@@ -545,8 +542,7 @@ namespace tomoto
545
542
 
546
543
  for (auto& doc : this->docs)
547
544
  {
548
- auto d = lda->_makeDoc(std::vector<std::string>{});
549
- for(auto w : doc.words) d.words.emplace_back(w);
545
+ auto d = lda->_makeFromRawDoc(doc);
550
546
  lda->_addDoc(d);
551
547
  }
552
548
 
@@ -101,7 +101,7 @@ namespace tomoto
101
101
  void addPathOne()
102
102
  {
103
103
  NCRPNode* node = this;
104
- for (size_t i = 0; i <= level; ++i)
104
+ for (size_t i = 0; i <= (size_t)level; ++i)
105
105
  {
106
106
  ++node->numCustomers;
107
107
  node = node->getParent();
@@ -132,7 +132,7 @@ namespace tomoto
132
132
  {
133
133
  size_t idx = node - nodes.data();
134
134
  const Float pNewNode = _MakeNewPath ? log(gamma / (node->numCustomers + gamma)) : -INFINITY;
135
- nodeLikelihoods[idx] = weight + ((node->level < levelDepth - 1) ? pNewNode : 0);
135
+ nodeLikelihoods[idx] = weight + (((size_t)node->level < levelDepth - 1) ? pNewNode : 0);
136
136
  for(auto * child = node->getChild(); child; child = child->getSibling())
137
137
  {
138
138
  updateNodeLikelihood(gamma, levelDepth, child, weight + log(child->numCustomers / (node->numCustomers + gamma)));
@@ -279,7 +279,7 @@ namespace tomoto
279
279
  nodes[idx].level = l;
280
280
  }
281
281
 
282
- if (ld.numByTopic.size() < nodes.size())
282
+ if ((size_t)ld.numByTopic.size() < nodes.size())
283
283
  {
284
284
  size_t oldSize = ld.numByTopic.rows();
285
285
  size_t newSize = std::max(nodes.size(), ((oldSize + oldSize / 2 + 7) / 8) * 8);
@@ -117,17 +117,6 @@ namespace tomoto
117
117
  Float _alpha = 0.1, Float _eta = 0.01, size_t seed = std::random_device{}(),
118
118
  bool scalarRng = false);
119
119
 
120
- virtual size_t addDoc(const std::vector<std::string>& words) = 0;
121
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words) const = 0;
122
-
123
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) = 0;
124
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) const = 0;
125
-
126
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
127
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) = 0;
128
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
129
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const = 0;
130
-
131
120
  virtual TermWeight getTermWeight() const = 0;
132
121
  virtual size_t getOptimInterval() const = 0;
133
122
  virtual void setOptimInterval(size_t) = 0;
@@ -609,7 +609,7 @@ namespace tomoto
609
609
 
610
610
  double getLL() const
611
611
  {
612
- return static_cast<const DerivedClass*>(this)->template getLLDocs<>(this->docs.begin(), this->docs.end())
612
+ return static_cast<const DerivedClass*>(this)->getLLDocs(this->docs.begin(), this->docs.end())
613
613
  + static_cast<const DerivedClass*>(this)->getLLRest(this->globalState);
614
614
  }
615
615
 
@@ -898,36 +898,24 @@ namespace tomoto
898
898
  burnIn = iteration;
899
899
  }
900
900
 
901
- size_t addDoc(const std::vector<std::string>& words) override
901
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
902
902
  {
903
- return this->_addDoc(this->_makeDoc(words));
903
+ return this->_addDoc(this->template _makeFromRawDoc<false>(rawDoc, tokenizer));
904
904
  }
905
905
 
906
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words) const override
906
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
907
907
  {
908
- return make_unique<_DocType>(as_mutable(this)->template _makeDoc<true>(words));
908
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer));
909
909
  }
910
910
 
911
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) override
911
+ size_t addDoc(const RawDoc& rawDoc) override
912
912
  {
913
- return this->_addDoc(this->template _makeRawDoc<false>(rawStr, tokenizer));
913
+ return this->_addDoc(this->_makeFromRawDoc(rawDoc));
914
914
  }
915
915
 
916
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) const override
916
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
917
917
  {
918
- return make_unique<_DocType>(as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer));
919
- }
920
-
921
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
922
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) override
923
- {
924
- return this->_addDoc(this->_makeRawDoc(rawStr, words, pos, len));
925
- }
926
-
927
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
928
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const override
929
- {
930
- return make_unique<_DocType>(this->_makeRawDoc(rawStr, words, pos, len));
918
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
931
919
  }
932
920
 
933
921
  void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
@@ -23,21 +23,6 @@ namespace tomoto
23
23
  Float alpha = 0.1, Float eta = 0.01, size_t seed = std::random_device{}(),
24
24
  bool scalarRng = false);
25
25
 
26
- virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) = 0;
27
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) const = 0;
28
-
29
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
30
- const std::vector<std::string>& label) = 0;
31
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
32
- const std::vector<std::string>& label) const = 0;
33
-
34
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
35
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
36
- const std::vector<std::string>& label) = 0;
37
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
38
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
39
- const std::vector<std::string>& label) const = 0;
40
-
41
26
  virtual const Dictionary& getTopicLabelDict() const = 0;
42
27
 
43
28
  virtual size_t getNumTopicsPerLabel() const = 0;
@@ -144,46 +144,28 @@ namespace tomoto
144
144
  return doc;
145
145
  }
146
146
 
147
- size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& labels) override
147
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
148
148
  {
149
- auto doc = this->_makeDoc(words);
150
- return this->_addDoc(_updateDoc(doc, labels));
149
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
150
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
151
151
  }
152
152
 
153
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& labels) const override
153
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
154
154
  {
155
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
156
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
155
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
156
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
157
157
  }
158
158
 
159
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
160
- const std::vector<std::string>& labels) override
159
+ size_t addDoc(const RawDoc& rawDoc) override
161
160
  {
162
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
163
- return this->_addDoc(_updateDoc(doc, labels));
161
+ auto doc = this->_makeFromRawDoc(rawDoc);
162
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
164
163
  }
165
164
 
166
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
167
- const std::vector<std::string>& labels) const override
165
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
168
166
  {
169
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
170
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
171
- }
172
-
173
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
174
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
175
- const std::vector<std::string>& labels) override
176
- {
177
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
178
- return this->_addDoc(_updateDoc(doc, labels));
179
- }
180
-
181
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
182
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
183
- const std::vector<std::string>& labels) const override
184
- {
185
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
186
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
167
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
168
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
187
169
  }
188
170
 
189
171
  std::vector<Float> getTopicsByDoc(const _DocType& doc) const
@@ -37,21 +37,6 @@ namespace tomoto
37
37
  Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, size_t seed = std::random_device{}(),
38
38
  bool scalarRng = false);
39
39
 
40
- virtual size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) = 0;
41
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const = 0;
42
-
43
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
44
- const std::string& delimiter) = 0;
45
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
46
- const std::string& delimiter) const = 0;
47
-
48
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
49
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
50
- const std::string& delimiter) = 0;
51
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
52
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
53
- const std::string& delimiter) const = 0;
54
-
55
40
  virtual size_t getKL() const = 0;
56
41
  virtual size_t getT() const = 0;
57
42
  virtual Float getAlphaL() const = 0;
@@ -385,53 +385,12 @@ namespace tomoto
385
385
  if (_etaL <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong etaL value (etaL = %f)", _etaL));
386
386
  }
387
387
 
388
-
389
- template<bool _const = false>
390
- _DocType _makeDoc(const std::vector<std::string>& words, const std::string& delimiter)
391
- {
392
- _DocType doc{ 1.f };
393
- size_t numSent = 0;
394
- for (auto& w : words)
395
- {
396
- if (w == delimiter)
397
- {
398
- ++numSent;
399
- continue;
400
- }
401
-
402
- Vid id;
403
- if (_const)
404
- {
405
- id = this->dict.toWid(w);
406
- if (id == (Vid)-1) continue;
407
- }
408
- else
409
- {
410
- id = this->dict.add(w);
411
- }
412
- doc.words.emplace_back(id);
413
- doc.sents.emplace_back(numSent);
414
- }
415
- doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
416
- return doc;
417
- }
418
-
419
- size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) override
420
- {
421
- return this->_addDoc(_makeDoc(words, delimiter));
422
- }
423
-
424
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const override
425
- {
426
- return make_unique<_DocType>(as_mutable(this)->template _makeDoc<true>(words, delimiter));
427
- }
428
-
429
388
  template<bool _const, typename _FnTokenizer>
430
- _DocType _makeRawDoc(const std::string& rawStr, _FnTokenizer&& tokenizer, const std::string& delimiter)
389
+ _DocType _makeFromRawDoc(const RawDoc& rawDoc, _FnTokenizer&& tokenizer, const std::string& delimiter)
431
390
  {
432
- _DocType doc{ 1.f };
391
+ _DocType doc;
433
392
  size_t numSent = 0;
434
- doc.rawStr = rawStr;
393
+ doc.rawStr = rawDoc.rawStr;
435
394
  for (auto& p : tokenizer(doc.rawStr))
436
395
  {
437
396
  if (std::get<0>(p) == delimiter)
@@ -461,57 +420,85 @@ namespace tomoto
461
420
  return doc;
462
421
  }
463
422
 
464
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
465
- const std::string& delimiter)
423
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer)
466
424
  {
467
- return this->_addDoc(_makeRawDoc<false>(rawStr, tokenizer, delimiter));
425
+ return this->_addDoc(_makeFromRawDoc<false>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
468
426
  }
469
427
 
470
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
471
- const std::string& delimiter) const
428
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const
472
429
  {
473
- return make_unique<_DocType>(as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer, delimiter));
430
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
474
431
  }
475
432
 
476
- _DocType _makeRawDoc(const std::string& rawStr, const std::vector<Vid>& words,
477
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len, const std::string& delimiter) const
433
+ template<bool _const = false>
434
+ _DocType _makeFromRawDoc(const RawDoc& rawDoc)
478
435
  {
479
- _DocType doc{ 1.f };
480
- doc.rawStr = rawStr;
436
+ _DocType doc;
437
+ doc.rawStr = rawDoc.rawStr;
438
+ auto delimiter = rawDoc.template getMisc<std::string>("delimiter");
481
439
  size_t numSent = 0;
482
440
  Vid delimiterId = this->dict.toWid(delimiter);
483
- for (size_t i = 0; i < words.size(); ++i)
441
+ if (!rawDoc.rawWords.empty())
484
442
  {
485
- auto& w = words[i];
486
- if (w == delimiterId)
443
+ for (size_t i = 0; i < rawDoc.rawWords.size(); ++i)
487
444
  {
488
- ++numSent;
489
- continue;
445
+ auto& w = rawDoc.rawWords[i];
446
+ if (w == delimiter)
447
+ {
448
+ ++numSent;
449
+ continue;
450
+ }
451
+
452
+ Vid id;
453
+ if (_const)
454
+ {
455
+ id = this->dict.toWid(w);
456
+ if (id == (Vid)-1) continue;
457
+ }
458
+ else
459
+ {
460
+ id = this->dict.add(w);
461
+ }
462
+ doc.words.emplace_back(id);
463
+ doc.sents.emplace_back(numSent);
464
+ if (rawDoc.rawWords.size() == rawDoc.origWordPos.size())
465
+ {
466
+ doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
467
+ doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
468
+ }
490
469
  }
491
- doc.words.emplace_back(w);
492
- doc.sents.emplace_back(numSent);
493
- if (words.size() == pos.size())
470
+ }
471
+ else if (!rawDoc.words.empty())
472
+ {
473
+ for (size_t i = 0; i < rawDoc.words.size(); ++i)
494
474
  {
495
- doc.origWordPos.emplace_back(pos[i]);
496
- doc.origWordLen.emplace_back(len[i]);
475
+ auto& w = rawDoc.words[i];
476
+ if (w == delimiterId)
477
+ {
478
+ ++numSent;
479
+ continue;
480
+ }
481
+ doc.words.emplace_back(w);
482
+ doc.sents.emplace_back(numSent);
483
+ if (rawDoc.words.size() == rawDoc.origWordPos.size())
484
+ {
485
+ doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
486
+ doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
487
+ }
497
488
  }
498
489
  }
499
490
  doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
500
491
  return doc;
501
492
  }
502
493
 
503
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
504
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
505
- const std::string& delimiter)
494
+ size_t addDoc(const RawDoc& rawDoc)
506
495
  {
507
- return this->_addDoc(_makeRawDoc(rawStr, words, pos, len, delimiter));
496
+ return this->_addDoc(_makeFromRawDoc(rawDoc));
508
497
  }
509
498
 
510
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
511
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
512
- const std::string& delimiter) const
499
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const
513
500
  {
514
- return make_unique<_DocType>(_makeRawDoc(rawStr, words, pos, len, delimiter));
501
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
515
502
  }
516
503
 
517
504
  void setWordPrior(const std::string& word, const std::vector<Float>& priors) override