tomoto 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -3
  5. data/ext/tomoto/ext.cpp +34 -9
  6. data/ext/tomoto/extconf.rb +2 -1
  7. data/lib/tomoto/dmr.rb +1 -1
  8. data/lib/tomoto/gdmr.rb +1 -1
  9. data/lib/tomoto/version.rb +1 -1
  10. data/vendor/tomotopy/LICENSE +1 -1
  11. data/vendor/tomotopy/README.kr.rst +32 -3
  12. data/vendor/tomotopy/README.rst +30 -1
  13. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
  14. data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
  15. data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
  16. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
  17. data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
  18. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
  20. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
  21. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
  22. data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
  23. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
  24. data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
  25. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
  26. data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
  27. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
  28. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
  29. data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
  30. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
  31. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
  32. data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
  33. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
  34. data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
  35. data/vendor/tomotopy/src/Utils/math.h +1 -1
  36. data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
  37. data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
  38. data/vendor/variant/LICENSE +25 -0
  39. data/vendor/variant/LICENSE_1_0.txt +23 -0
  40. data/vendor/variant/README.md +102 -0
  41. data/vendor/variant/include/mapbox/optional.hpp +74 -0
  42. data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
  43. data/vendor/variant/include/mapbox/variant.hpp +974 -0
  44. data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
  45. metadata +15 -7
@@ -84,12 +84,12 @@ namespace tomoto
84
84
  {
85
85
  const size_t V = this->realV;
86
86
  size_t pos;
87
- for (pos = 0; pos < ld.numTableByTopic.size(); ++pos)
87
+ for (pos = 0; pos < (size_t)ld.numTableByTopic.size(); ++pos)
88
88
  {
89
89
  if (!ld.numTableByTopic[pos]) break;
90
90
  }
91
91
 
92
- if (pos >= ld.numByTopic.size())
92
+ if (pos >= (size_t)ld.numByTopic.size())
93
93
  {
94
94
  size_t oldSize = ld.numByTopic.size(), newSize = pos + 1;
95
95
  ld.numTableByTopic.conservativeResize(newSize);
@@ -302,8 +302,7 @@ namespace tomoto
302
302
  size_t oldSize = globalState.numByTopic.size();
303
303
  globalState.numByTopic.conservativeResize(K);
304
304
  globalState.numByTopic.tail(K - oldSize).setZero();
305
- globalState.numTableByTopic.conservativeResize(K);
306
- globalState.numTableByTopic.tail(K - oldSize).setZero();
305
+ globalState.numTableByTopic.resize(K);
307
306
  globalState.numByTopicWord.conservativeResize(K, Eigen::NoChange);
308
307
  globalState.numByTopicWord.block(oldSize, 0, K - oldSize, V).setZero();
309
308
  }
@@ -314,8 +313,6 @@ namespace tomoto
314
313
  size_t locK = localData[i].numByTopic.size();
315
314
  globalState.numByTopic.head(locK)
316
315
  += localData[i].numByTopic.head(locK) - tState.numByTopic.head(locK);
317
- globalState.numTableByTopic.head(locK)
318
- += localData[i].numTableByTopic.head(locK) - tState.numTableByTopic.head(locK);
319
316
  globalState.numByTopicWord.block(0, 0, locK, V)
320
317
  += localData[i].numByTopicWord.block(0, 0, locK, V) - tState.numByTopicWord.block(0, 0, locK, V);
321
318
  }
@@ -327,10 +324,16 @@ namespace tomoto
327
324
  globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0);
328
325
  }
329
326
 
330
- globalState.totalTable = accumulate(this->docs.begin(), this->docs.end(), 0, [](size_t sum, const _DocType& doc)
327
+
328
+ globalState.numTableByTopic.setZero();
329
+ for (auto& doc : this->docs)
331
330
  {
332
- return sum + doc.getNumTable();
333
- });
331
+ for (auto& table : doc.numTopicByTable)
332
+ {
333
+ if (table) globalState.numTableByTopic[table.topic]++;
334
+ }
335
+ }
336
+ globalState.totalTable = globalState.numTableByTopic.sum();
334
337
 
335
338
  for (size_t i = 0; i < pool.getNumWorkers(); ++i)
336
339
  {
@@ -368,19 +371,13 @@ namespace tomoto
368
371
  const auto eta = this->eta;
369
372
  double ll = 0;
370
373
  // table partition ll
371
- size_t liveK = 0;
372
- for (Tid k = 0; k < K; ++k)
373
- {
374
- if (!isLiveTopic(k)) continue;
375
- ll += math::lgammaT(ld.numTableByTopic[k]);
376
- ++liveK;
377
- }
378
-
374
+ size_t liveK = (ld.numTableByTopic.array() > 0).template cast<size_t>().sum();
375
+ Eigen::ArrayXf lg = math::lgammaApprox(ld.numTableByTopic.array().template cast<Float>());
376
+ ll += (ld.numTableByTopic.array() > 0).select(lg, 0).sum();
379
377
  ll += liveK * log(gamma) - math::lgammaT(ld.totalTable + gamma) + math::lgammaT(gamma);
380
378
 
381
379
  // topic word ll
382
380
  ll += liveK * math::lgammaT(V * eta);
383
-
384
381
  for (Tid k = 0; k < K; ++k)
385
382
  {
386
383
  if (!isLiveTopic(k)) continue;
@@ -545,8 +542,7 @@ namespace tomoto
545
542
 
546
543
  for (auto& doc : this->docs)
547
544
  {
548
- auto d = lda->_makeDoc(std::vector<std::string>{});
549
- for(auto w : doc.words) d.words.emplace_back(w);
545
+ auto d = lda->_makeFromRawDoc(doc);
550
546
  lda->_addDoc(d);
551
547
  }
552
548
 
@@ -101,7 +101,7 @@ namespace tomoto
101
101
  void addPathOne()
102
102
  {
103
103
  NCRPNode* node = this;
104
- for (size_t i = 0; i <= level; ++i)
104
+ for (size_t i = 0; i <= (size_t)level; ++i)
105
105
  {
106
106
  ++node->numCustomers;
107
107
  node = node->getParent();
@@ -132,7 +132,7 @@ namespace tomoto
132
132
  {
133
133
  size_t idx = node - nodes.data();
134
134
  const Float pNewNode = _MakeNewPath ? log(gamma / (node->numCustomers + gamma)) : -INFINITY;
135
- nodeLikelihoods[idx] = weight + ((node->level < levelDepth - 1) ? pNewNode : 0);
135
+ nodeLikelihoods[idx] = weight + (((size_t)node->level < levelDepth - 1) ? pNewNode : 0);
136
136
  for(auto * child = node->getChild(); child; child = child->getSibling())
137
137
  {
138
138
  updateNodeLikelihood(gamma, levelDepth, child, weight + log(child->numCustomers / (node->numCustomers + gamma)));
@@ -279,7 +279,7 @@ namespace tomoto
279
279
  nodes[idx].level = l;
280
280
  }
281
281
 
282
- if (ld.numByTopic.size() < nodes.size())
282
+ if ((size_t)ld.numByTopic.size() < nodes.size())
283
283
  {
284
284
  size_t oldSize = ld.numByTopic.rows();
285
285
  size_t newSize = std::max(nodes.size(), ((oldSize + oldSize / 2 + 7) / 8) * 8);
@@ -117,17 +117,6 @@ namespace tomoto
117
117
  Float _alpha = 0.1, Float _eta = 0.01, size_t seed = std::random_device{}(),
118
118
  bool scalarRng = false);
119
119
 
120
- virtual size_t addDoc(const std::vector<std::string>& words) = 0;
121
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words) const = 0;
122
-
123
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) = 0;
124
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) const = 0;
125
-
126
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
127
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) = 0;
128
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
129
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const = 0;
130
-
131
120
  virtual TermWeight getTermWeight() const = 0;
132
121
  virtual size_t getOptimInterval() const = 0;
133
122
  virtual void setOptimInterval(size_t) = 0;
@@ -609,7 +609,7 @@ namespace tomoto
609
609
 
610
610
  double getLL() const
611
611
  {
612
- return static_cast<const DerivedClass*>(this)->template getLLDocs<>(this->docs.begin(), this->docs.end())
612
+ return static_cast<const DerivedClass*>(this)->getLLDocs(this->docs.begin(), this->docs.end())
613
613
  + static_cast<const DerivedClass*>(this)->getLLRest(this->globalState);
614
614
  }
615
615
 
@@ -898,36 +898,24 @@ namespace tomoto
898
898
  burnIn = iteration;
899
899
  }
900
900
 
901
- size_t addDoc(const std::vector<std::string>& words) override
901
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
902
902
  {
903
- return this->_addDoc(this->_makeDoc(words));
903
+ return this->_addDoc(this->template _makeFromRawDoc<false>(rawDoc, tokenizer));
904
904
  }
905
905
 
906
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words) const override
906
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
907
907
  {
908
- return make_unique<_DocType>(as_mutable(this)->template _makeDoc<true>(words));
908
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer));
909
909
  }
910
910
 
911
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) override
911
+ size_t addDoc(const RawDoc& rawDoc) override
912
912
  {
913
- return this->_addDoc(this->template _makeRawDoc<false>(rawStr, tokenizer));
913
+ return this->_addDoc(this->_makeFromRawDoc(rawDoc));
914
914
  }
915
915
 
916
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) const override
916
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
917
917
  {
918
- return make_unique<_DocType>(as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer));
919
- }
920
-
921
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
922
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) override
923
- {
924
- return this->_addDoc(this->_makeRawDoc(rawStr, words, pos, len));
925
- }
926
-
927
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
928
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const override
929
- {
930
- return make_unique<_DocType>(this->_makeRawDoc(rawStr, words, pos, len));
918
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
931
919
  }
932
920
 
933
921
  void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
@@ -23,21 +23,6 @@ namespace tomoto
23
23
  Float alpha = 0.1, Float eta = 0.01, size_t seed = std::random_device{}(),
24
24
  bool scalarRng = false);
25
25
 
26
- virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) = 0;
27
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) const = 0;
28
-
29
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
30
- const std::vector<std::string>& label) = 0;
31
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
32
- const std::vector<std::string>& label) const = 0;
33
-
34
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
35
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
36
- const std::vector<std::string>& label) = 0;
37
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
38
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
39
- const std::vector<std::string>& label) const = 0;
40
-
41
26
  virtual const Dictionary& getTopicLabelDict() const = 0;
42
27
 
43
28
  virtual size_t getNumTopicsPerLabel() const = 0;
@@ -144,46 +144,28 @@ namespace tomoto
144
144
  return doc;
145
145
  }
146
146
 
147
- size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& labels) override
147
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
148
148
  {
149
- auto doc = this->_makeDoc(words);
150
- return this->_addDoc(_updateDoc(doc, labels));
149
+ auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
150
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
151
151
  }
152
152
 
153
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& labels) const override
153
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
154
154
  {
155
- auto doc = as_mutable(this)->template _makeDoc<true>(words);
156
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
155
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
156
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
157
157
  }
158
158
 
159
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
160
- const std::vector<std::string>& labels) override
159
+ size_t addDoc(const RawDoc& rawDoc) override
161
160
  {
162
- auto doc = this->template _makeRawDoc<false>(rawStr, tokenizer);
163
- return this->_addDoc(_updateDoc(doc, labels));
161
+ auto doc = this->_makeFromRawDoc(rawDoc);
162
+ return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
164
163
  }
165
164
 
166
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
167
- const std::vector<std::string>& labels) const override
165
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
168
166
  {
169
- auto doc = as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer);
170
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
171
- }
172
-
173
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
174
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
175
- const std::vector<std::string>& labels) override
176
- {
177
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
178
- return this->_addDoc(_updateDoc(doc, labels));
179
- }
180
-
181
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
182
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
183
- const std::vector<std::string>& labels) const override
184
- {
185
- auto doc = this->_makeRawDoc(rawStr, words, pos, len);
186
- return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
167
+ auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
168
+ return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
187
169
  }
188
170
 
189
171
  std::vector<Float> getTopicsByDoc(const _DocType& doc) const
@@ -37,21 +37,6 @@ namespace tomoto
37
37
  Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, size_t seed = std::random_device{}(),
38
38
  bool scalarRng = false);
39
39
 
40
- virtual size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) = 0;
41
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const = 0;
42
-
43
- virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
44
- const std::string& delimiter) = 0;
45
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
46
- const std::string& delimiter) const = 0;
47
-
48
- virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
49
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
50
- const std::string& delimiter) = 0;
51
- virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
52
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
53
- const std::string& delimiter) const = 0;
54
-
55
40
  virtual size_t getKL() const = 0;
56
41
  virtual size_t getT() const = 0;
57
42
  virtual Float getAlphaL() const = 0;
@@ -385,53 +385,12 @@ namespace tomoto
385
385
  if (_etaL <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong etaL value (etaL = %f)", _etaL));
386
386
  }
387
387
 
388
-
389
- template<bool _const = false>
390
- _DocType _makeDoc(const std::vector<std::string>& words, const std::string& delimiter)
391
- {
392
- _DocType doc{ 1.f };
393
- size_t numSent = 0;
394
- for (auto& w : words)
395
- {
396
- if (w == delimiter)
397
- {
398
- ++numSent;
399
- continue;
400
- }
401
-
402
- Vid id;
403
- if (_const)
404
- {
405
- id = this->dict.toWid(w);
406
- if (id == (Vid)-1) continue;
407
- }
408
- else
409
- {
410
- id = this->dict.add(w);
411
- }
412
- doc.words.emplace_back(id);
413
- doc.sents.emplace_back(numSent);
414
- }
415
- doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
416
- return doc;
417
- }
418
-
419
- size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) override
420
- {
421
- return this->_addDoc(_makeDoc(words, delimiter));
422
- }
423
-
424
- std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const override
425
- {
426
- return make_unique<_DocType>(as_mutable(this)->template _makeDoc<true>(words, delimiter));
427
- }
428
-
429
388
  template<bool _const, typename _FnTokenizer>
430
- _DocType _makeRawDoc(const std::string& rawStr, _FnTokenizer&& tokenizer, const std::string& delimiter)
389
+ _DocType _makeFromRawDoc(const RawDoc& rawDoc, _FnTokenizer&& tokenizer, const std::string& delimiter)
431
390
  {
432
- _DocType doc{ 1.f };
391
+ _DocType doc;
433
392
  size_t numSent = 0;
434
- doc.rawStr = rawStr;
393
+ doc.rawStr = rawDoc.rawStr;
435
394
  for (auto& p : tokenizer(doc.rawStr))
436
395
  {
437
396
  if (std::get<0>(p) == delimiter)
@@ -461,57 +420,85 @@ namespace tomoto
461
420
  return doc;
462
421
  }
463
422
 
464
- size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
465
- const std::string& delimiter)
423
+ size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer)
466
424
  {
467
- return this->_addDoc(_makeRawDoc<false>(rawStr, tokenizer, delimiter));
425
+ return this->_addDoc(_makeFromRawDoc<false>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
468
426
  }
469
427
 
470
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
471
- const std::string& delimiter) const
428
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const
472
429
  {
473
- return make_unique<_DocType>(as_mutable(this)->template _makeRawDoc<true>(rawStr, tokenizer, delimiter));
430
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
474
431
  }
475
432
 
476
- _DocType _makeRawDoc(const std::string& rawStr, const std::vector<Vid>& words,
477
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len, const std::string& delimiter) const
433
+ template<bool _const = false>
434
+ _DocType _makeFromRawDoc(const RawDoc& rawDoc)
478
435
  {
479
- _DocType doc{ 1.f };
480
- doc.rawStr = rawStr;
436
+ _DocType doc;
437
+ doc.rawStr = rawDoc.rawStr;
438
+ auto delimiter = rawDoc.template getMisc<std::string>("delimiter");
481
439
  size_t numSent = 0;
482
440
  Vid delimiterId = this->dict.toWid(delimiter);
483
- for (size_t i = 0; i < words.size(); ++i)
441
+ if (!rawDoc.rawWords.empty())
484
442
  {
485
- auto& w = words[i];
486
- if (w == delimiterId)
443
+ for (size_t i = 0; i < rawDoc.rawWords.size(); ++i)
487
444
  {
488
- ++numSent;
489
- continue;
445
+ auto& w = rawDoc.rawWords[i];
446
+ if (w == delimiter)
447
+ {
448
+ ++numSent;
449
+ continue;
450
+ }
451
+
452
+ Vid id;
453
+ if (_const)
454
+ {
455
+ id = this->dict.toWid(w);
456
+ if (id == (Vid)-1) continue;
457
+ }
458
+ else
459
+ {
460
+ id = this->dict.add(w);
461
+ }
462
+ doc.words.emplace_back(id);
463
+ doc.sents.emplace_back(numSent);
464
+ if (rawDoc.rawWords.size() == rawDoc.origWordPos.size())
465
+ {
466
+ doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
467
+ doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
468
+ }
490
469
  }
491
- doc.words.emplace_back(w);
492
- doc.sents.emplace_back(numSent);
493
- if (words.size() == pos.size())
470
+ }
471
+ else if (!rawDoc.words.empty())
472
+ {
473
+ for (size_t i = 0; i < rawDoc.words.size(); ++i)
494
474
  {
495
- doc.origWordPos.emplace_back(pos[i]);
496
- doc.origWordLen.emplace_back(len[i]);
475
+ auto& w = rawDoc.words[i];
476
+ if (w == delimiterId)
477
+ {
478
+ ++numSent;
479
+ continue;
480
+ }
481
+ doc.words.emplace_back(w);
482
+ doc.sents.emplace_back(numSent);
483
+ if (rawDoc.words.size() == rawDoc.origWordPos.size())
484
+ {
485
+ doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
486
+ doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
487
+ }
497
488
  }
498
489
  }
499
490
  doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
500
491
  return doc;
501
492
  }
502
493
 
503
- size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
504
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
505
- const std::string& delimiter)
494
+ size_t addDoc(const RawDoc& rawDoc)
506
495
  {
507
- return this->_addDoc(_makeRawDoc(rawStr, words, pos, len, delimiter));
496
+ return this->_addDoc(_makeFromRawDoc(rawDoc));
508
497
  }
509
498
 
510
- std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
511
- const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
512
- const std::string& delimiter) const
499
+ std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const
513
500
  {
514
- return make_unique<_DocType>(_makeRawDoc(rawStr, words, pos, len, delimiter));
501
+ return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
515
502
  }
516
503
 
517
504
  void setWordPrior(const std::string& word, const std::vector<Float>& priors) override