tomoto 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/ext/tomoto/ext.cpp +34 -9
- data/ext/tomoto/extconf.rb +2 -1
- data/lib/tomoto/dmr.rb +1 -1
- data/lib/tomoto/gdmr.rb +1 -1
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/LICENSE +1 -1
- data/vendor/tomotopy/README.kr.rst +32 -3
- data/vendor/tomotopy/README.rst +30 -1
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
- data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
- data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
- data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
- data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
- data/vendor/tomotopy/src/Utils/math.h +1 -1
- data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
- data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- data/vendor/variant/include/mapbox/optional.hpp +74 -0
- data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
- data/vendor/variant/include/mapbox/variant.hpp +974 -0
- data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
- metadata +15 -7
@@ -84,12 +84,12 @@ namespace tomoto
|
|
84
84
|
{
|
85
85
|
const size_t V = this->realV;
|
86
86
|
size_t pos;
|
87
|
-
for (pos = 0; pos < ld.numTableByTopic.size(); ++pos)
|
87
|
+
for (pos = 0; pos < (size_t)ld.numTableByTopic.size(); ++pos)
|
88
88
|
{
|
89
89
|
if (!ld.numTableByTopic[pos]) break;
|
90
90
|
}
|
91
91
|
|
92
|
-
if (pos >= ld.numByTopic.size())
|
92
|
+
if (pos >= (size_t)ld.numByTopic.size())
|
93
93
|
{
|
94
94
|
size_t oldSize = ld.numByTopic.size(), newSize = pos + 1;
|
95
95
|
ld.numTableByTopic.conservativeResize(newSize);
|
@@ -302,8 +302,7 @@ namespace tomoto
|
|
302
302
|
size_t oldSize = globalState.numByTopic.size();
|
303
303
|
globalState.numByTopic.conservativeResize(K);
|
304
304
|
globalState.numByTopic.tail(K - oldSize).setZero();
|
305
|
-
globalState.numTableByTopic.
|
306
|
-
globalState.numTableByTopic.tail(K - oldSize).setZero();
|
305
|
+
globalState.numTableByTopic.resize(K);
|
307
306
|
globalState.numByTopicWord.conservativeResize(K, Eigen::NoChange);
|
308
307
|
globalState.numByTopicWord.block(oldSize, 0, K - oldSize, V).setZero();
|
309
308
|
}
|
@@ -314,8 +313,6 @@ namespace tomoto
|
|
314
313
|
size_t locK = localData[i].numByTopic.size();
|
315
314
|
globalState.numByTopic.head(locK)
|
316
315
|
+= localData[i].numByTopic.head(locK) - tState.numByTopic.head(locK);
|
317
|
-
globalState.numTableByTopic.head(locK)
|
318
|
-
+= localData[i].numTableByTopic.head(locK) - tState.numTableByTopic.head(locK);
|
319
316
|
globalState.numByTopicWord.block(0, 0, locK, V)
|
320
317
|
+= localData[i].numByTopicWord.block(0, 0, locK, V) - tState.numByTopicWord.block(0, 0, locK, V);
|
321
318
|
}
|
@@ -327,10 +324,16 @@ namespace tomoto
|
|
327
324
|
globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0);
|
328
325
|
}
|
329
326
|
|
330
|
-
|
327
|
+
|
328
|
+
globalState.numTableByTopic.setZero();
|
329
|
+
for (auto& doc : this->docs)
|
331
330
|
{
|
332
|
-
|
333
|
-
|
331
|
+
for (auto& table : doc.numTopicByTable)
|
332
|
+
{
|
333
|
+
if (table) globalState.numTableByTopic[table.topic]++;
|
334
|
+
}
|
335
|
+
}
|
336
|
+
globalState.totalTable = globalState.numTableByTopic.sum();
|
334
337
|
|
335
338
|
for (size_t i = 0; i < pool.getNumWorkers(); ++i)
|
336
339
|
{
|
@@ -368,19 +371,13 @@ namespace tomoto
|
|
368
371
|
const auto eta = this->eta;
|
369
372
|
double ll = 0;
|
370
373
|
// table partition ll
|
371
|
-
size_t liveK = 0;
|
372
|
-
|
373
|
-
|
374
|
-
if (!isLiveTopic(k)) continue;
|
375
|
-
ll += math::lgammaT(ld.numTableByTopic[k]);
|
376
|
-
++liveK;
|
377
|
-
}
|
378
|
-
|
374
|
+
size_t liveK = (ld.numTableByTopic.array() > 0).template cast<size_t>().sum();
|
375
|
+
Eigen::ArrayXf lg = math::lgammaApprox(ld.numTableByTopic.array().template cast<Float>());
|
376
|
+
ll += (ld.numTableByTopic.array() > 0).select(lg, 0).sum();
|
379
377
|
ll += liveK * log(gamma) - math::lgammaT(ld.totalTable + gamma) + math::lgammaT(gamma);
|
380
378
|
|
381
379
|
// topic word ll
|
382
380
|
ll += liveK * math::lgammaT(V * eta);
|
383
|
-
|
384
381
|
for (Tid k = 0; k < K; ++k)
|
385
382
|
{
|
386
383
|
if (!isLiveTopic(k)) continue;
|
@@ -545,8 +542,7 @@ namespace tomoto
|
|
545
542
|
|
546
543
|
for (auto& doc : this->docs)
|
547
544
|
{
|
548
|
-
auto d = lda->
|
549
|
-
for(auto w : doc.words) d.words.emplace_back(w);
|
545
|
+
auto d = lda->_makeFromRawDoc(doc);
|
550
546
|
lda->_addDoc(d);
|
551
547
|
}
|
552
548
|
|
@@ -101,7 +101,7 @@ namespace tomoto
|
|
101
101
|
void addPathOne()
|
102
102
|
{
|
103
103
|
NCRPNode* node = this;
|
104
|
-
for (size_t i = 0; i <= level; ++i)
|
104
|
+
for (size_t i = 0; i <= (size_t)level; ++i)
|
105
105
|
{
|
106
106
|
++node->numCustomers;
|
107
107
|
node = node->getParent();
|
@@ -132,7 +132,7 @@ namespace tomoto
|
|
132
132
|
{
|
133
133
|
size_t idx = node - nodes.data();
|
134
134
|
const Float pNewNode = _MakeNewPath ? log(gamma / (node->numCustomers + gamma)) : -INFINITY;
|
135
|
-
nodeLikelihoods[idx] = weight + ((node->level < levelDepth - 1) ? pNewNode : 0);
|
135
|
+
nodeLikelihoods[idx] = weight + (((size_t)node->level < levelDepth - 1) ? pNewNode : 0);
|
136
136
|
for(auto * child = node->getChild(); child; child = child->getSibling())
|
137
137
|
{
|
138
138
|
updateNodeLikelihood(gamma, levelDepth, child, weight + log(child->numCustomers / (node->numCustomers + gamma)));
|
@@ -279,7 +279,7 @@ namespace tomoto
|
|
279
279
|
nodes[idx].level = l;
|
280
280
|
}
|
281
281
|
|
282
|
-
if (ld.numByTopic.size() < nodes.size())
|
282
|
+
if ((size_t)ld.numByTopic.size() < nodes.size())
|
283
283
|
{
|
284
284
|
size_t oldSize = ld.numByTopic.rows();
|
285
285
|
size_t newSize = std::max(nodes.size(), ((oldSize + oldSize / 2 + 7) / 8) * 8);
|
@@ -117,17 +117,6 @@ namespace tomoto
|
|
117
117
|
Float _alpha = 0.1, Float _eta = 0.01, size_t seed = std::random_device{}(),
|
118
118
|
bool scalarRng = false);
|
119
119
|
|
120
|
-
virtual size_t addDoc(const std::vector<std::string>& words) = 0;
|
121
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words) const = 0;
|
122
|
-
|
123
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) = 0;
|
124
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer) const = 0;
|
125
|
-
|
126
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
127
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) = 0;
|
128
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
129
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const = 0;
|
130
|
-
|
131
120
|
virtual TermWeight getTermWeight() const = 0;
|
132
121
|
virtual size_t getOptimInterval() const = 0;
|
133
122
|
virtual void setOptimInterval(size_t) = 0;
|
@@ -609,7 +609,7 @@ namespace tomoto
|
|
609
609
|
|
610
610
|
double getLL() const
|
611
611
|
{
|
612
|
-
return static_cast<const DerivedClass*>(this)->
|
612
|
+
return static_cast<const DerivedClass*>(this)->getLLDocs(this->docs.begin(), this->docs.end())
|
613
613
|
+ static_cast<const DerivedClass*>(this)->getLLRest(this->globalState);
|
614
614
|
}
|
615
615
|
|
@@ -898,36 +898,24 @@ namespace tomoto
|
|
898
898
|
burnIn = iteration;
|
899
899
|
}
|
900
900
|
|
901
|
-
size_t addDoc(const
|
901
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
902
902
|
{
|
903
|
-
return this->_addDoc(this->
|
903
|
+
return this->_addDoc(this->template _makeFromRawDoc<false>(rawDoc, tokenizer));
|
904
904
|
}
|
905
905
|
|
906
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
906
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
907
907
|
{
|
908
|
-
return make_unique<_DocType>(as_mutable(this)->template
|
908
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer));
|
909
909
|
}
|
910
910
|
|
911
|
-
size_t addDoc(const
|
911
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
912
912
|
{
|
913
|
-
return this->_addDoc(this->
|
913
|
+
return this->_addDoc(this->_makeFromRawDoc(rawDoc));
|
914
914
|
}
|
915
915
|
|
916
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
916
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
917
917
|
{
|
918
|
-
return make_unique<_DocType>(as_mutable(this)->template
|
919
|
-
}
|
920
|
-
|
921
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
922
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) override
|
923
|
-
{
|
924
|
-
return this->_addDoc(this->_makeRawDoc(rawStr, words, pos, len));
|
925
|
-
}
|
926
|
-
|
927
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
928
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len) const override
|
929
|
-
{
|
930
|
-
return make_unique<_DocType>(this->_makeRawDoc(rawStr, words, pos, len));
|
918
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
|
931
919
|
}
|
932
920
|
|
933
921
|
void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
|
@@ -23,21 +23,6 @@ namespace tomoto
|
|
23
23
|
Float alpha = 0.1, Float eta = 0.01, size_t seed = std::random_device{}(),
|
24
24
|
bool scalarRng = false);
|
25
25
|
|
26
|
-
virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) = 0;
|
27
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& label) const = 0;
|
28
|
-
|
29
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
30
|
-
const std::vector<std::string>& label) = 0;
|
31
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
32
|
-
const std::vector<std::string>& label) const = 0;
|
33
|
-
|
34
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
35
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
36
|
-
const std::vector<std::string>& label) = 0;
|
37
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
38
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
39
|
-
const std::vector<std::string>& label) const = 0;
|
40
|
-
|
41
26
|
virtual const Dictionary& getTopicLabelDict() const = 0;
|
42
27
|
|
43
28
|
virtual size_t getNumTopicsPerLabel() const = 0;
|
@@ -144,46 +144,28 @@ namespace tomoto
|
|
144
144
|
return doc;
|
145
145
|
}
|
146
146
|
|
147
|
-
size_t addDoc(const
|
147
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
148
148
|
{
|
149
|
-
auto doc = this->
|
150
|
-
return this->_addDoc(_updateDoc(doc, labels));
|
149
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
150
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
151
151
|
}
|
152
152
|
|
153
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
153
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
154
154
|
{
|
155
|
-
auto doc = as_mutable(this)->template
|
156
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
|
155
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
156
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
157
157
|
}
|
158
158
|
|
159
|
-
size_t addDoc(const
|
160
|
-
const std::vector<std::string>& labels) override
|
159
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
161
160
|
{
|
162
|
-
auto doc = this->
|
163
|
-
return this->_addDoc(_updateDoc(doc, labels));
|
161
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
162
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
164
163
|
}
|
165
164
|
|
166
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
167
|
-
const std::vector<std::string>& labels) const override
|
165
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
168
166
|
{
|
169
|
-
auto doc = as_mutable(this)->template
|
170
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
|
171
|
-
}
|
172
|
-
|
173
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
174
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
175
|
-
const std::vector<std::string>& labels) override
|
176
|
-
{
|
177
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
178
|
-
return this->_addDoc(_updateDoc(doc, labels));
|
179
|
-
}
|
180
|
-
|
181
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
182
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
183
|
-
const std::vector<std::string>& labels) const override
|
184
|
-
{
|
185
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
186
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, labels));
|
167
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
168
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
|
187
169
|
}
|
188
170
|
|
189
171
|
std::vector<Float> getTopicsByDoc(const _DocType& doc) const
|
@@ -37,21 +37,6 @@ namespace tomoto
|
|
37
37
|
Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, size_t seed = std::random_device{}(),
|
38
38
|
bool scalarRng = false);
|
39
39
|
|
40
|
-
virtual size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) = 0;
|
41
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const = 0;
|
42
|
-
|
43
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
44
|
-
const std::string& delimiter) = 0;
|
45
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
46
|
-
const std::string& delimiter) const = 0;
|
47
|
-
|
48
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
49
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
50
|
-
const std::string& delimiter) = 0;
|
51
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
52
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
53
|
-
const std::string& delimiter) const = 0;
|
54
|
-
|
55
40
|
virtual size_t getKL() const = 0;
|
56
41
|
virtual size_t getT() const = 0;
|
57
42
|
virtual Float getAlphaL() const = 0;
|
@@ -385,53 +385,12 @@ namespace tomoto
|
|
385
385
|
if (_etaL <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong etaL value (etaL = %f)", _etaL));
|
386
386
|
}
|
387
387
|
|
388
|
-
|
389
|
-
template<bool _const = false>
|
390
|
-
_DocType _makeDoc(const std::vector<std::string>& words, const std::string& delimiter)
|
391
|
-
{
|
392
|
-
_DocType doc{ 1.f };
|
393
|
-
size_t numSent = 0;
|
394
|
-
for (auto& w : words)
|
395
|
-
{
|
396
|
-
if (w == delimiter)
|
397
|
-
{
|
398
|
-
++numSent;
|
399
|
-
continue;
|
400
|
-
}
|
401
|
-
|
402
|
-
Vid id;
|
403
|
-
if (_const)
|
404
|
-
{
|
405
|
-
id = this->dict.toWid(w);
|
406
|
-
if (id == (Vid)-1) continue;
|
407
|
-
}
|
408
|
-
else
|
409
|
-
{
|
410
|
-
id = this->dict.add(w);
|
411
|
-
}
|
412
|
-
doc.words.emplace_back(id);
|
413
|
-
doc.sents.emplace_back(numSent);
|
414
|
-
}
|
415
|
-
doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
|
416
|
-
return doc;
|
417
|
-
}
|
418
|
-
|
419
|
-
size_t addDoc(const std::vector<std::string>& words, const std::string& delimiter) override
|
420
|
-
{
|
421
|
-
return this->_addDoc(_makeDoc(words, delimiter));
|
422
|
-
}
|
423
|
-
|
424
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::string& delimiter) const override
|
425
|
-
{
|
426
|
-
return make_unique<_DocType>(as_mutable(this)->template _makeDoc<true>(words, delimiter));
|
427
|
-
}
|
428
|
-
|
429
388
|
template<bool _const, typename _FnTokenizer>
|
430
|
-
_DocType
|
389
|
+
_DocType _makeFromRawDoc(const RawDoc& rawDoc, _FnTokenizer&& tokenizer, const std::string& delimiter)
|
431
390
|
{
|
432
|
-
_DocType doc
|
391
|
+
_DocType doc;
|
433
392
|
size_t numSent = 0;
|
434
|
-
doc.rawStr = rawStr;
|
393
|
+
doc.rawStr = rawDoc.rawStr;
|
435
394
|
for (auto& p : tokenizer(doc.rawStr))
|
436
395
|
{
|
437
396
|
if (std::get<0>(p) == delimiter)
|
@@ -461,57 +420,85 @@ namespace tomoto
|
|
461
420
|
return doc;
|
462
421
|
}
|
463
422
|
|
464
|
-
size_t addDoc(const
|
465
|
-
const std::string& delimiter)
|
423
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer)
|
466
424
|
{
|
467
|
-
return this->_addDoc(
|
425
|
+
return this->_addDoc(_makeFromRawDoc<false>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
|
468
426
|
}
|
469
427
|
|
470
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
471
|
-
const std::string& delimiter) const
|
428
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const
|
472
429
|
{
|
473
|
-
return make_unique<_DocType>(as_mutable(this)->template
|
430
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer, rawDoc.template getMisc<std::string>("delimiter")));
|
474
431
|
}
|
475
432
|
|
476
|
-
|
477
|
-
|
433
|
+
template<bool _const = false>
|
434
|
+
_DocType _makeFromRawDoc(const RawDoc& rawDoc)
|
478
435
|
{
|
479
|
-
_DocType doc
|
480
|
-
doc.rawStr = rawStr;
|
436
|
+
_DocType doc;
|
437
|
+
doc.rawStr = rawDoc.rawStr;
|
438
|
+
auto delimiter = rawDoc.template getMisc<std::string>("delimiter");
|
481
439
|
size_t numSent = 0;
|
482
440
|
Vid delimiterId = this->dict.toWid(delimiter);
|
483
|
-
|
441
|
+
if (!rawDoc.rawWords.empty())
|
484
442
|
{
|
485
|
-
|
486
|
-
if (w == delimiterId)
|
443
|
+
for (size_t i = 0; i < rawDoc.rawWords.size(); ++i)
|
487
444
|
{
|
488
|
-
|
489
|
-
|
445
|
+
auto& w = rawDoc.rawWords[i];
|
446
|
+
if (w == delimiter)
|
447
|
+
{
|
448
|
+
++numSent;
|
449
|
+
continue;
|
450
|
+
}
|
451
|
+
|
452
|
+
Vid id;
|
453
|
+
if (_const)
|
454
|
+
{
|
455
|
+
id = this->dict.toWid(w);
|
456
|
+
if (id == (Vid)-1) continue;
|
457
|
+
}
|
458
|
+
else
|
459
|
+
{
|
460
|
+
id = this->dict.add(w);
|
461
|
+
}
|
462
|
+
doc.words.emplace_back(id);
|
463
|
+
doc.sents.emplace_back(numSent);
|
464
|
+
if (rawDoc.rawWords.size() == rawDoc.origWordPos.size())
|
465
|
+
{
|
466
|
+
doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
|
467
|
+
doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
|
468
|
+
}
|
490
469
|
}
|
491
|
-
|
492
|
-
|
493
|
-
|
470
|
+
}
|
471
|
+
else if (!rawDoc.words.empty())
|
472
|
+
{
|
473
|
+
for (size_t i = 0; i < rawDoc.words.size(); ++i)
|
494
474
|
{
|
495
|
-
|
496
|
-
|
475
|
+
auto& w = rawDoc.words[i];
|
476
|
+
if (w == delimiterId)
|
477
|
+
{
|
478
|
+
++numSent;
|
479
|
+
continue;
|
480
|
+
}
|
481
|
+
doc.words.emplace_back(w);
|
482
|
+
doc.sents.emplace_back(numSent);
|
483
|
+
if (rawDoc.words.size() == rawDoc.origWordPos.size())
|
484
|
+
{
|
485
|
+
doc.origWordPos.emplace_back(rawDoc.origWordPos[i]);
|
486
|
+
doc.origWordLen.emplace_back(rawDoc.origWordLen[i]);
|
487
|
+
}
|
497
488
|
}
|
498
489
|
}
|
499
490
|
doc.numBySent.resize(doc.sents.empty() ? 0 : (doc.sents.back() + 1));
|
500
491
|
return doc;
|
501
492
|
}
|
502
493
|
|
503
|
-
size_t addDoc(const
|
504
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
505
|
-
const std::string& delimiter)
|
494
|
+
size_t addDoc(const RawDoc& rawDoc)
|
506
495
|
{
|
507
|
-
return this->_addDoc(
|
496
|
+
return this->_addDoc(_makeFromRawDoc(rawDoc));
|
508
497
|
}
|
509
498
|
|
510
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
511
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
512
|
-
const std::string& delimiter) const
|
499
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const
|
513
500
|
{
|
514
|
-
return make_unique<_DocType>(
|
501
|
+
return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
|
515
502
|
}
|
516
503
|
|
517
504
|
void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
|