tomoto 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/ext/tomoto/ext.cpp +34 -9
- data/ext/tomoto/extconf.rb +2 -1
- data/lib/tomoto/dmr.rb +1 -1
- data/lib/tomoto/gdmr.rb +1 -1
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/LICENSE +1 -1
- data/vendor/tomotopy/README.kr.rst +32 -3
- data/vendor/tomotopy/README.rst +30 -1
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
- data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
- data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
- data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
- data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
- data/vendor/tomotopy/src/Utils/math.h +1 -1
- data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
- data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- data/vendor/variant/include/mapbox/optional.hpp +74 -0
- data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
- data/vendor/variant/include/mapbox/variant.hpp +974 -0
- data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
- metadata +15 -7
@@ -84,12 +84,12 @@ namespace tomoto
|
|
84
84
|
{
|
85
85
|
const size_t V = this->realV;
|
86
86
|
size_t pos;
|
87
|
-
for (pos = 0; pos < ld.numTableByTopic.size(); ++pos)
|
87
|
+
for (pos = 0; pos < (size_t)ld.numTableByTopic.size(); ++pos)
|
88
88
|
{
|
89
89
|
if (!ld.numTableByTopic[pos]) break;
|
90
90
|
}
|
91
91
|
|
92
|
-
if (pos >= ld.numByTopic.size())
|
92
|
+
if (pos >= (size_t)ld.numByTopic.size())
|
93
93
|
{
|
94
94
|
size_t oldSize = ld.numByTopic.size(), newSize = pos + 1;
|
95
95
|
ld.numTableByTopic.conservativeResize(newSize);
|
@@ -302,8 +302,7 @@ namespace tomoto
|
|
302
302
|
size_t oldSize = globalState.numByTopic.size();
|
303
303
|
globalState.numByTopic.conservativeResize(K);
|
304
304
|
globalState.numByTopic.tail(K - oldSize).setZero();
|
305
|
-
globalState.numTableByTopic.
|
306
|
-
globalState.numTableByTopic.tail(K - oldSize).setZero();
|
305
|
+
globalState.numTableByTopic.resize(K);
|
307
306
|
globalState.numByTopicWord.conservativeResize(K, Eigen::NoChange);
|
308
307
|
globalState.numByTopicWord.block(oldSize, 0, K - oldSize, V).setZero();
|
309
308
|
}
|
@@ -314,8 +313,6 @@ namespace tomoto
|
|
314
313
|
size_t locK = localData[i].numByTopic.size();
|
315
314
|
globalState.numByTopic.head(locK)
|
316
315
|
+= localData[i].numByTopic.head(locK) - tState.numByTopic.head(locK);
|
317
|
-
globalState.numTableByTopic.head(locK)
|
318
|
-
+= localData[i].numTableByTopic.head(locK) - tState.numTableByTopic.head(locK);
|
319
316
|
globalState.numByTopicWord.block(0, 0, locK, V)
|
320
317
|
+= localData[i].numByTopicWord.block(0, 0, locK, V) - tState.numByTopicWord.block(0, 0, locK, V);
|
321
318
|
}
|
@@ -327,10 +324,16 @@ namespace tomoto
|
|
327
324
|
globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0);
|
328
325
|
}
|
329
326
|
|
330
|
-
|
327
|
+
|
328
|
+
globalState.numTableByTopic.setZero();
|
329
|
+
for (auto& doc : this->docs)
|
331
330
|
{
|
332
|
-
|
333
|
-
|
331
|
+
for (auto& table : doc.numTopicByTable)
|
332
|
+
{
|
333
|
+
if (table) globalState.numTableByTopic[table.topic]++;
|
334
|
+
}
|
335
|
+
}
|
336
|
+
globalState.totalTable = globalState.numTableByTopic.sum();
|
334
337
|
|
335
338
|
for (size_t i = 0; i < pool.getNumWorkers(); ++i)
|
336
339
|
{
|
@@ -368,19 +371,13 @@ namespace tomoto
|
|
368
371
|
const auto eta = this->eta;
|
369
372
|
double ll = 0;
|
370
373
|
// table partition ll
|
371
|
-
size_t liveK = 0;
|
372
|
-
|
373
|
-
|
374
|
-
if (!isLiveTopic(k)) continue;
|
375
|
-
ll += math::lgammaT(ld.numTableByTopic[k]);
|
376
|
-
++liveK;
|
377
|
-
}
|
378
|
-
|
374
|
+
size_t liveK = (ld.numTableByTopic.array() > 0).template cast<size_t>().sum();
|
375
|
+
Eigen::ArrayXf lg = math::lgammaApprox(ld.numTableByTopic.array().template cast<Float>());
|
376
|
+
ll += (ld.numTableByTopic.array() > 0).select(lg, 0).sum();
|
379
377
|
ll += liveK * log(gamma) - math::lgammaT(ld.totalTable + gamma) + math::lgammaT(gamma);
|
380
378
|
|
381
379
|
// topic word ll
|
382
380
|
ll += liveK * math::lgammaT(V * eta);
|
383
|
-
|
384
381
|
for (Tid k = 0; k < K; ++k)
|
385
382
|
{
|
386
383
|
if (!isLiveTopic(k)) continue;
|
@@ -545,8 +542,7 @@ namespace tomoto
|
|
545
542
|
|
546
543
|
for (auto& doc : this->docs)
|
547
544
|
{
|
548
|
-
auto d = lda->
|
549
|
-
for(auto w : doc.words) d.words.emplace_back(w);
|
545
|
+
auto d = lda->_makeFromRawDoc(doc);
|
550
546
|
lda->_addDoc(d);
|
551
547
|
}
|
552
548
|
|
@@ -101,7 +101,7 @@ namespace tomoto
|
|
101
101
|
void addPathOne()
|
102
102
|
{
|
103
103
|
NCRPNode* node = this;
|
104
|
-
for (size_t i = 0; i <= level; ++i)
|
104
|
+
for (size_t i = 0; i <= (size_t)level; ++i)
|
105
105
|
{
|
106
106
|
++node->numCustomers;
|
107
107
|
node = node->getParent();
|
@@ -132,7 +132,7 @@ namespace tomoto
|
|
132
132
|
{
|
133
133
|
size_t idx = node - nodes.data();
|
134
134
|
const Float pNewNode = _MakeNewPath ? log(gamma / (node->numCustomers + gamma)) : -INFINITY;
|
135
|
-
nodeLikelihoods[idx] = weight + ((node->level < levelDepth - 1) ? pNewNode : 0);
|
135
|
+
nodeLikelihoods[idx] = weight + (((size_t)node->level < levelDepth - 1) ? pNewNode : 0);
|
136
136
|
for(auto * child = node->getChild(); child; child = child->getSibling())
|
137
137
|
{
|
138
138
|
updateNodeLikelihood(gamma, levelDepth, child, weight + log(child->numCustomers / (node->numCustomers + gamma)));
|
@@ -279,7 +279,7 @@ namespace tomoto
|
|
279
279
|
nodes[idx].level = l;
|
280
280
|
}
|
281
281
|
|
282
|
-
if (ld.numByTopic.size() < nodes.size())
|
282
|
+
if ((size_t)ld.numByTopic.size() < nodes.size())
|
283
283
|
{
|
284
284
|
size_t oldSize = ld.numByTopic.rows();
|
285
285
|
size_t newSize = std::max(nodes.size(), ((oldSize + oldSize / 2 + 7) / 8) * 8);
|
@@ -117,17 +117,6 @@ namespace tomoto
|
|
117
117
|
Float _alpha = 0.1, Float _eta = 0.01, size_t seed = std::random_device{}(),
|
118
118
|
bool scalarRng = false);
|
119
119
|
|
120
|
-
virtual size_t addDoc(const std::vector<std::string>& words) = 0;
|
121
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words) const = 0;
|
122
|
-
|
123
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) = 0;
|
124
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) const = 0;
|
125
|
-
|
126
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
127
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) = 0;
|
128
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
129
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const = 0;
|
130
|
-
|
131
120
|
virtual TermWeight getTermWeight() const = 0;
|
132
121
|
virtual size_t getOptimInterval() const = 0;
|
133
122
|
virtual void setOptimInterval(size_t) = 0;
|
@@ -609,7 +609,7 @@ namespace tomoto
|
|
609
609
|
|
610
610
|
double getLL() const
|
611
611
|
{
|
612
|
-
return static_cast<const DerivedClass*>(this)->
|
612
|
+
return static_cast<const DerivedClass*>(this)->getLLDocs(this->docs.begin(), this->docs.end())
|
613
613
|
+ static_cast<const DerivedClass*>(this)->getLLRest(this->globalState);
|
614
614
|
}
|
615
615
|
|
@@ -898,36 +898,24 @@ namespace tomoto
|
|
898
898
|
burnIn = iteration;
|
899
899
|
}
|
900
900
|
|
901
|
-
size_t addDoc(const
|
901
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
902
902
|
{
|
903
|
-
return this->_addDoc(this->
|
903
|
+
return this->_addDoc(this->template _makeFromRawDoc<false>(rawDoc, tokenizer));
|
904
904
|
}
|
905
905
|
|
906
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
906
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
907
907
|
{
|
908
|
-
return make_unique<_DocType>(as_mutable(this)->template
|
908
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer));
|
909
909
|
}
|
910
910
|
|
911
|
-
size_t addDoc(const
|
911
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
912
912
|
{
|
913
|
-
return this->_addDoc(this->
|
913
|
+
return this->_addDoc(this->_makeFromRawDoc(rawDoc));
|
914
914
|
}
|
915
915
|
|
916
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
916
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
917
917
|
{
|
918
|
-
return make_unique<_DocType>(as_mutable(this)->template
|
919
|
-
}
|
920
|
-
|
921
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
922
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) override
|
923
|
-
{
|
924
|
-
return this->_addDoc(this->_makeRawDoc(rawStr, words, pos, len));
|
925
|
-
}
|
926
|
-
|
927
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
928
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const override
|
929
|
-
{
|
930
|
-
return make_unique<_DocType>(this->_makeRawDoc(rawStr, words, pos, len));
|
918
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
|
931
919
|
}
|
932
920
|
|
933
921
|
void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
|
@@ -23,21 +23,6 @@ namespace tomoto
|
|
23
23
|
Float alpha = 0.1, Float eta = 0.01, size_t seed = std::random_device{}(),
|
24
24
|
bool scalarRng = false);
|
25
25
|
|
26
|
-
virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) = 0;
|
27
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) const = 0;
|
28
|
-
|
29
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
30
|
-
const std::vector<std::string>& label) = 0;
|
31
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
32
|
-
const std::vector<std::string>& label) const = 0;
|
33
|
-
|
34
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
35
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
36
|
-
const std::vector<std::string>& label) = 0;
|
37
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
38
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
39
|
-
const std::vector<std::string>& label) const = 0;
|
40
|
-
|
41
26
|
virtual const Dictionary& getTopicLabelDict() const = 0;
|
42
27
|
|
43
28
|
virtual size_t getNumTopicsPerLabel() const = 0;
|
@@ -144,46 +144,28 @@ namespace tomoto
|
|
144
144
|
return doc;
|
145
145
|
}
|
146
146
|
|
147
|
-
size_t addDoc(const
|
147
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
148
148
|
{
|
149
|
-
auto doc = this->
|
150
|
-
return this->_addDoc(_updateDoc(doc, labels));
|
149
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
150
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
151
151
|
}
|
152
152
|
|
153
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
153
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
154
154
|
{
|
155
|
-
auto doc = as_mutable(this)->template
|
156
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
|
155
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
156
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
157
157
|
}
|
158
158
|
|
159
|
-
size_t addDoc(const
|
160
|
-
const std::vector<std::string>& labels) override
|
159
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
161
160
|
{
|
162
|
-
auto doc = this->
|
163
|
-
return this->_addDoc(_updateDoc(doc, labels));
|
161
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
162
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
164
163
|
}
|
165
164
|
|
166
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
167
|
-
const std::vector<std::string>& labels) const override
|
165
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
168
166
|
{
|
169
|
-
auto doc = as_mutable(this)->template
|
170
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
|
171
|
-
}
|
172
|
-
|
173
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
174
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
175
|
-
const std::vector<std::string>& labels) override
|
176
|
-
{
|
177
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
178
|
-
return this->_addDoc(_updateDoc(doc, labels));
|
179
|
-
}
|
180
|
-
|
181
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
182
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
183
|
-
const std::vector<std::string>& labels) const override
|
184
|
-
{
|
185
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
186
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
|
167
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
168
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
187
169
|
}
|
188
170
|
|
189
171
|
std::vector<Float> getTopicsByDoc(const _DocType& doc) const
|
@@ -37,21 +37,6 @@ namespace tomoto
|
|
37
37
|
Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, size_t seed = std::random_device{}(),
|
38
38
|
bool scalarRng = false);
|
39
39
|
|
40
|
-
virtual size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) = 0;
|
41
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const = 0;
|
42
|
-
|
43
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
44
|
-
const std::string& delimiter) = 0;
|
45
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
46
|
-
const std::string& delimiter) const = 0;
|
47
|
-
|
48
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
49
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
50
|
-
const std::string& delimiter) = 0;
|
51
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
52
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
53
|
-
const std::string& delimiter) const = 0;
|
54
|
-
|
55
40
|
virtual size_t getKL() const = 0;
|
56
41
|
virtual size_t getT() const = 0;
|
57
42
|
virtual Float getAlphaL() const = 0;
|
@@ -385,53 +385,12 @@ namespace tomoto
|
|
385
385
|
if (_etaL <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong etaL value (etaL = %f)", _etaL));
|
386
386
|
}
|
387
387
|
|
388
|
-
|
389
|
-
template<bool _const = false>
|
390
|
-
_DocType _makeDoc(const std::vector<std::string>& words, const std::string& delimiter)
|
391
|
-
{
|
392
|
-
_DocType doc{ 1.f };
|
393
|
-
size_t numSent = 0;
|
394
|
-
for (auto& w : words)
|
395
|
-
{
|
396
|
-
if (w == delimiter)
|
397
|
-
{
|
398
|
-
++numSent;
|
399
|
-
continue;
|
400
|
-
}
|
401
|
-
|
402
|
-
Vid id;
|
403
|
-
if (_const)
|
404
|
-
{
|
405
|
-
id = this->dict.toWid(w);
|
406
|
-
if (id == (Vid)-1) continue;
|
407
|
-
}
|
408
|
-
else
|
409
|
-
{
|
410
|
-
id = this->dict.add(w);
|
411
|
-
}
|
412
|
-
doc.words.emplace_back(id);
|
413
|
-
doc.sents.emplace_back(numSent);
|
414
|
-
}
|
415
|
-
doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
|
416
|
-
return doc;
|
417
|
-
}
|
418
|
-
|
419
|
-
size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) override
|
420
|
-
{
|
421
|
-
return this->_addDoc(_makeDoc(words, delimiter));
|
422
|
-
}
|
423
|
-
|
424
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const override
|
425
|
-
{
|
426
|
-
return make_unique<_DocType>(as_mutable(this)->template _makeDoc<true>(words, delimiter));
|
427
|
-
}
|
428
|
-
|
429
388
|
template<bool _const, typename _FnTokenizer>
|
430
|
-
_DocType
|
389
|
+
_DocType _makeFromRawDoc(const RawDoc& rawDoc, _FnTokenizer&& tokenizer, const std::string& delimiter)
|
431
390
|
{
|
432
|
-
_DocType doc
|
391
|
+
_DocType doc;
|
433
392
|
size_t numSent = 0;
|
434
|
-
doc.rawStr = rawStr;
|
393
|
+
doc.rawStr = rawDoc.rawStr;
|
435
394
|
for (auto& p : tokenizer(doc.rawStr))
|
436
395
|
{
|
437
396
|
if (std::get<0>(p) == delimiter)
|
@@ -461,57 +420,85 @@ namespace tomoto
|
|
461
420
|
return doc;
|
462
421
|
}
|
463
422
|
|
464
|
-
size_t addDoc(const
|
465
|
-
const std::string& delimiter)
|
423
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer)
|
466
424
|
{
|
467
|
-
return this->_addDoc(
|
425
|
+
return this->_addDoc(_makeFromRawDoc<false>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
|
468
426
|
}
|
469
427
|
|
470
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
471
|
-
const std::string& delimiter) const
|
428
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const
|
472
429
|
{
|
473
|
-
return make_unique<_DocType>(as_mutable(this)->template
|
430
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
|
474
431
|
}
|
475
432
|
|
476
|
-
|
477
|
-
|
433
|
+
template<bool _const = false>
|
434
|
+
_DocType _makeFromRawDoc(const RawDoc& rawDoc)
|
478
435
|
{
|
479
|
-
_DocType doc
|
480
|
-
doc.rawStr = rawStr;
|
436
|
+
_DocType doc;
|
437
|
+
doc.rawStr = rawDoc.rawStr;
|
438
|
+
auto delimiter = rawDoc.template getMisc<std::string>("delimiter");
|
481
439
|
size_t numSent = 0;
|
482
440
|
Vid delimiterId = this->dict.toWid(delimiter);
|
483
|
-
|
441
|
+
if (!rawDoc.rawWords.empty())
|
484
442
|
{
|
485
|
-
|
486
|
-
if (w == delimiterId)
|
443
|
+
for (size_t i = 0; i < rawDoc.rawWords.size(); ++i)
|
487
444
|
{
|
488
|
-
|
489
|
-
|
445
|
+
auto& w = rawDoc.rawWords[i];
|
446
|
+
if (w == delimiter)
|
447
|
+
{
|
448
|
+
++numSent;
|
449
|
+
continue;
|
450
|
+
}
|
451
|
+
|
452
|
+
Vid id;
|
453
|
+
if (_const)
|
454
|
+
{
|
455
|
+
id = this->dict.toWid(w);
|
456
|
+
if (id == (Vid)-1) continue;
|
457
|
+
}
|
458
|
+
else
|
459
|
+
{
|
460
|
+
id = this->dict.add(w);
|
461
|
+
}
|
462
|
+
doc.words.emplace_back(id);
|
463
|
+
doc.sents.emplace_back(numSent);
|
464
|
+
if (rawDoc.rawWords.size() == rawDoc.origWordPos.size())
|
465
|
+
{
|
466
|
+
doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
|
467
|
+
doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
|
468
|
+
}
|
490
469
|
}
|
491
|
-
|
492
|
-
|
493
|
-
|
470
|
+
}
|
471
|
+
else if (!rawDoc.words.empty())
|
472
|
+
{
|
473
|
+
for (size_t i = 0; i < rawDoc.words.size(); ++i)
|
494
474
|
{
|
495
|
-
|
496
|
-
|
475
|
+
auto& w = rawDoc.words[i];
|
476
|
+
if (w == delimiterId)
|
477
|
+
{
|
478
|
+
++numSent;
|
479
|
+
continue;
|
480
|
+
}
|
481
|
+
doc.words.emplace_back(w);
|
482
|
+
doc.sents.emplace_back(numSent);
|
483
|
+
if (rawDoc.words.size() == rawDoc.origWordPos.size())
|
484
|
+
{
|
485
|
+
doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
|
486
|
+
doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
|
487
|
+
}
|
497
488
|
}
|
498
489
|
}
|
499
490
|
doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
|
500
491
|
return doc;
|
501
492
|
}
|
502
493
|
|
503
|
-
size_t addDoc(const
|
504
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
505
|
-
const std::string& delimiter)
|
494
|
+
size_t addDoc(const RawDoc& rawDoc)
|
506
495
|
{
|
507
|
-
return this->_addDoc(
|
496
|
+
return this->_addDoc(_makeFromRawDoc(rawDoc));
|
508
497
|
}
|
509
498
|
|
510
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
511
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
512
|
-
const std::string& delimiter) const
|
499
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const
|
513
500
|
{
|
514
|
-
return make_unique<_DocType>(
|
501
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
|
515
502
|
}
|
516
503
|
|
517
504
|
void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
|