tomoto 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/ext/tomoto/ext.cpp +34 -9
- data/ext/tomoto/extconf.rb +2 -1
- data/lib/tomoto/dmr.rb +1 -1
- data/lib/tomoto/gdmr.rb +1 -1
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/LICENSE +1 -1
- data/vendor/tomotopy/README.kr.rst +32 -3
- data/vendor/tomotopy/README.rst +30 -1
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
- data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
- data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
- data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
- data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
- data/vendor/tomotopy/src/Utils/math.h +1 -1
- data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
- data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- data/vendor/variant/include/mapbox/optional.hpp +74 -0
- data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
- data/vendor/variant/include/mapbox/variant.hpp +974 -0
- data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
- metadata +15 -7
@@ -16,12 +16,163 @@ namespace tomoto
|
|
16
16
|
{
|
17
17
|
namespace label
|
18
18
|
{
|
19
|
+
template<typename _DocIter, typename _Freqs>
|
20
|
+
std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
|
21
|
+
_Freqs&& vocabFreqs, _Freqs&& vocabDf,
|
22
|
+
size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
|
23
|
+
{
|
24
|
+
struct vvhash
|
25
|
+
{
|
26
|
+
size_t operator()(const std::pair<Vid, Vid>& k) const
|
27
|
+
{
|
28
|
+
return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
|
29
|
+
}
|
30
|
+
};
|
31
|
+
|
32
|
+
// counting unigrams & bigrams
|
33
|
+
std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
|
34
|
+
|
35
|
+
for(auto docIt = docBegin; docIt != docEnd; ++docIt)
|
36
|
+
{
|
37
|
+
std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
|
38
|
+
auto doc = *docIt;
|
39
|
+
Vid prevWord = doc[0];
|
40
|
+
for (size_t j = 1; j < doc.size(); ++j)
|
41
|
+
{
|
42
|
+
Vid curWord = doc[j];
|
43
|
+
if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
|
44
|
+
{
|
45
|
+
if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
|
46
|
+
{
|
47
|
+
bigramCnt[std::make_pair(prevWord, curWord)]++;
|
48
|
+
uniqBigram.emplace(prevWord, curWord);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
prevWord = curWord;
|
52
|
+
}
|
53
|
+
|
54
|
+
for (auto& p : uniqBigram) bigramDf[p]++;
|
55
|
+
}
|
56
|
+
|
57
|
+
|
58
|
+
// counting ngrams
|
59
|
+
std::vector<TrieEx<Vid, size_t>> trieNodes;
|
60
|
+
|
61
|
+
if (maxNgrams > 2)
|
62
|
+
{
|
63
|
+
std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
|
64
|
+
for (auto& p : bigramCnt)
|
65
|
+
{
|
66
|
+
if (p.second >= candMinCnt) validPair.emplace(p.first);
|
67
|
+
}
|
68
|
+
|
69
|
+
trieNodes.resize(1);
|
70
|
+
auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
|
71
|
+
|
72
|
+
for (auto docIt = docBegin; docIt != docEnd; ++docIt)
|
73
|
+
{
|
74
|
+
auto doc = *docIt;
|
75
|
+
if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
|
76
|
+
{
|
77
|
+
trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
|
78
|
+
}
|
79
|
+
|
80
|
+
Vid prevWord = doc[0];
|
81
|
+
size_t labelLen = 0;
|
82
|
+
auto node = &trieNodes[0];
|
83
|
+
if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
|
84
|
+
{
|
85
|
+
node = trieNodes[0].makeNext(prevWord, allocNode);
|
86
|
+
node->val++;
|
87
|
+
labelLen = 1;
|
88
|
+
}
|
89
|
+
|
90
|
+
for (size_t j = 1; j < doc.size(); ++j)
|
91
|
+
{
|
92
|
+
Vid curWord = doc[j];
|
93
|
+
|
94
|
+
if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
|
95
|
+
{
|
96
|
+
node = &trieNodes[0];
|
97
|
+
labelLen = 0;
|
98
|
+
}
|
99
|
+
else
|
100
|
+
{
|
101
|
+
if (labelLen >= maxNgrams)
|
102
|
+
{
|
103
|
+
node = node->getFail();
|
104
|
+
labelLen--;
|
105
|
+
}
|
106
|
+
|
107
|
+
if (validPair.count(std::make_pair(prevWord, curWord)))
|
108
|
+
{
|
109
|
+
auto nnode = node->makeNext(curWord, allocNode);
|
110
|
+
node = nnode;
|
111
|
+
do
|
112
|
+
{
|
113
|
+
nnode->val++;
|
114
|
+
} while (nnode = nnode->getFail());
|
115
|
+
labelLen++;
|
116
|
+
}
|
117
|
+
else
|
118
|
+
{
|
119
|
+
node = trieNodes[0].makeNext(curWord, allocNode);
|
120
|
+
node->val++;
|
121
|
+
labelLen = 1;
|
122
|
+
}
|
123
|
+
}
|
124
|
+
prevWord = curWord;
|
125
|
+
}
|
126
|
+
}
|
127
|
+
}
|
128
|
+
|
129
|
+
float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
|
130
|
+
|
131
|
+
// calculating PMIs
|
132
|
+
std::vector<Candidate> candidates;
|
133
|
+
for (auto& p : bigramCnt)
|
134
|
+
{
|
135
|
+
auto& bigram = p.first;
|
136
|
+
if (p.second < candMinCnt) continue;
|
137
|
+
if (bigramDf[bigram] < candMinDf) continue;
|
138
|
+
auto pmi = std::log(p.second * totN
|
139
|
+
/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
|
140
|
+
if (pmi <= 0) continue;
|
141
|
+
candidates.emplace_back(pmi, bigram.first, bigram.second);
|
142
|
+
}
|
143
|
+
|
144
|
+
if (maxNgrams > 2)
|
145
|
+
{
|
146
|
+
std::vector<Vid> rkeys;
|
147
|
+
trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
|
148
|
+
{
|
149
|
+
if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
|
150
|
+
auto pmi = node->val / totN;
|
151
|
+
for (auto k : rkeys)
|
152
|
+
{
|
153
|
+
pmi *= totN / vocabFreqs[k];
|
154
|
+
}
|
155
|
+
pmi = std::log(pmi);
|
156
|
+
if (pmi < minScore) return;
|
157
|
+
candidates.emplace_back(pmi, rkeys);
|
158
|
+
}, rkeys);
|
159
|
+
}
|
160
|
+
|
161
|
+
std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
|
162
|
+
{
|
163
|
+
return a.score > b.score;
|
164
|
+
});
|
165
|
+
if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
|
166
|
+
return candidates;
|
167
|
+
}
|
168
|
+
|
169
|
+
|
19
170
|
class PMIExtractor : public IExtractor
|
20
171
|
{
|
21
|
-
size_t candMinCnt, candMinDf, maxLabelLen, maxCandidates;
|
172
|
+
size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
|
22
173
|
public:
|
23
|
-
PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
|
24
|
-
: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
|
174
|
+
PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
|
175
|
+
: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
|
25
176
|
{
|
26
177
|
}
|
27
178
|
|
@@ -33,7 +184,7 @@ namespace tomoto
|
|
33
184
|
struct CandidateEx : public Candidate
|
34
185
|
{
|
35
186
|
std::unordered_map<std::string, size_t> names;
|
36
|
-
std::
|
187
|
+
std::set<size_t> docIds;
|
37
188
|
Eigen::Array<Float, -1, 1> scores;
|
38
189
|
|
39
190
|
CandidateEx()
|
@@ -49,6 +200,7 @@ namespace tomoto
|
|
49
200
|
const ITopicModel* tm;
|
50
201
|
size_t candMinDf;
|
51
202
|
float smoothing, lambda, mu;
|
203
|
+
size_t windowSize;
|
52
204
|
std::unique_ptr<ThreadPool> pool;
|
53
205
|
std::unique_ptr<std::mutex[]> mtx;
|
54
206
|
std::vector<CandidateEx> candidates;
|
@@ -63,9 +215,10 @@ namespace tomoto
|
|
63
215
|
FoRelevance(const ITopicModel* _tm,
|
64
216
|
_Iter candFirst, _Iter candEnd,
|
65
217
|
size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
|
218
|
+
size_t _windowSize = (size_t)-1,
|
66
219
|
size_t numWorkers = 0)
|
67
220
|
: tm{ _tm }, candMinDf{ _candMinDf },
|
68
|
-
smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }
|
221
|
+
smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }, windowSize{ _windowSize }
|
69
222
|
{
|
70
223
|
if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
|
71
224
|
if (numWorkers > 1)
|
@@ -8,7 +8,7 @@ namespace tomoto
|
|
8
8
|
{
|
9
9
|
using BaseDocument = DocumentLDA<_tw>;
|
10
10
|
using DocumentLDA<_tw>::DocumentLDA;
|
11
|
-
|
11
|
+
uint64_t metadata = 0;
|
12
12
|
|
13
13
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
|
14
14
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata);
|
@@ -23,21 +23,6 @@ namespace tomoto
|
|
23
23
|
size_t seed = std::random_device{}(),
|
24
24
|
bool scalarRng = false);
|
25
25
|
|
26
|
-
virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) = 0;
|
27
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const = 0;
|
28
|
-
|
29
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
30
|
-
const std::vector<std::string>& metadata) = 0;
|
31
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
32
|
-
const std::vector<std::string>& metadata) const = 0;
|
33
|
-
|
34
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
35
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
36
|
-
const std::vector<std::string>& metadata) = 0;
|
37
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
38
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
39
|
-
const std::vector<std::string>& metadata) const = 0;
|
40
|
-
|
41
26
|
virtual void setAlphaEps(Float _alphaEps) = 0;
|
42
27
|
virtual Float getAlphaEps() const = 0;
|
43
28
|
virtual void setOptimRepeat(size_t repeat) = 0;
|
@@ -262,63 +262,44 @@ namespace tomoto
|
|
262
262
|
}
|
263
263
|
|
264
264
|
template<bool _const = false>
|
265
|
-
_DocType& _updateDoc(_DocType& doc, const std::
|
265
|
+
_DocType& _updateDoc(_DocType& doc, const std::string& metadata)
|
266
266
|
{
|
267
|
-
std::string metadataJoined = text::join(metadata.begin(), metadata.end(), "_");
|
268
267
|
Vid xid;
|
269
268
|
if (_const)
|
270
269
|
{
|
271
|
-
xid = metadataDict.toWid(
|
270
|
+
xid = metadataDict.toWid(metadata);
|
272
271
|
if (xid == (Vid)-1) throw std::invalid_argument("unknown metadata");
|
273
272
|
}
|
274
273
|
else
|
275
274
|
{
|
276
|
-
xid = metadataDict.add(
|
275
|
+
xid = metadataDict.add(metadata);
|
277
276
|
}
|
278
277
|
doc.metadata = xid;
|
279
278
|
return doc;
|
280
279
|
}
|
281
280
|
|
282
|
-
size_t addDoc(const
|
281
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
283
282
|
{
|
284
|
-
auto doc = this->
|
285
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
283
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
284
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
|
286
285
|
}
|
287
286
|
|
288
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
287
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
289
288
|
{
|
290
|
-
auto doc = as_mutable(this)->template
|
291
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
289
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
290
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
|
292
291
|
}
|
293
292
|
|
294
|
-
size_t addDoc(const
|
295
|
-
const std::vector<std::string>& metadata) override
|
293
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
296
294
|
{
|
297
|
-
auto doc = this->
|
298
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
295
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
296
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
|
299
297
|
}
|
300
298
|
|
301
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
302
|
-
const std::vector<std::string>& metadata) const override
|
299
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
303
300
|
{
|
304
|
-
auto doc = as_mutable(this)->template
|
305
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
306
|
-
}
|
307
|
-
|
308
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
309
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
310
|
-
const std::vector<std::string>& metadata) override
|
311
|
-
{
|
312
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
313
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
314
|
-
}
|
315
|
-
|
316
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
317
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
318
|
-
const std::vector<std::string>& metadata) const override
|
319
|
-
{
|
320
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
321
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
301
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
302
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
|
322
303
|
}
|
323
304
|
|
324
305
|
GETTER(F, size_t, F);
|
@@ -10,7 +10,7 @@ namespace tomoto
|
|
10
10
|
using BaseDocument = DocumentLDA<_tw>;
|
11
11
|
using DocumentLDA<_tw>::DocumentLDA;
|
12
12
|
|
13
|
-
|
13
|
+
uint64_t timepoint = 0;
|
14
14
|
ShareableVector<Float> eta;
|
15
15
|
sample::AliasMethod<> aliasTable;
|
16
16
|
|
@@ -29,21 +29,6 @@ namespace tomoto
|
|
29
29
|
size_t seed = std::random_device{}(),
|
30
30
|
bool scalarRng = false);
|
31
31
|
|
32
|
-
virtual size_t addDoc(const std::vector<std::string>& words, size_t timepoint) = 0;
|
33
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, size_t timepoint) const = 0;
|
34
|
-
|
35
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
36
|
-
size_t timepoint) = 0;
|
37
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
38
|
-
size_t timepoint) const = 0;
|
39
|
-
|
40
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
41
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
42
|
-
size_t timepoint) = 0;
|
43
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
44
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
45
|
-
size_t timepoint) const = 0;
|
46
|
-
|
47
32
|
virtual size_t getT() const = 0;
|
48
33
|
virtual std::vector<uint32_t> getNumDocsByT() const = 0;
|
49
34
|
|
@@ -468,7 +468,7 @@ namespace tomoto
|
|
468
468
|
return ret;
|
469
469
|
}
|
470
470
|
|
471
|
-
_DocType& _updateDoc(_DocType& doc,
|
471
|
+
_DocType& _updateDoc(_DocType& doc, uint32_t timepoint) const
|
472
472
|
{
|
473
473
|
if (timepoint >= T) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "timepoint must < T");
|
474
474
|
doc.timepoint = timepoint;
|
@@ -512,51 +512,34 @@ namespace tomoto
|
|
512
512
|
{
|
513
513
|
}
|
514
514
|
|
515
|
-
size_t addDoc(const
|
515
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
516
516
|
{
|
517
|
-
auto doc = this->
|
518
|
-
return this->_addDoc(_updateDoc(doc, timepoint));
|
517
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
518
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
519
519
|
}
|
520
520
|
|
521
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
521
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
522
522
|
{
|
523
|
-
auto doc = as_mutable(this)->template
|
524
|
-
return make_unique<_DocType>(_updateDoc(doc, timepoint));
|
523
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
524
|
+
return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
525
525
|
}
|
526
526
|
|
527
|
-
size_t addDoc(const
|
528
|
-
size_t timepoint) override
|
527
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
529
528
|
{
|
530
|
-
auto doc = this->
|
531
|
-
return this->_addDoc(_updateDoc(doc, timepoint));
|
529
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
530
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
532
531
|
}
|
533
532
|
|
534
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
535
|
-
size_t timepoint) const override
|
533
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
536
534
|
{
|
537
|
-
auto doc = as_mutable(this)->template
|
538
|
-
return make_unique<_DocType>(_updateDoc(doc, timepoint));
|
539
|
-
}
|
540
|
-
|
541
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
542
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
543
|
-
size_t timepoint) override
|
544
|
-
{
|
545
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
546
|
-
return this->_addDoc(_updateDoc(doc, timepoint));
|
547
|
-
}
|
548
|
-
|
549
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
550
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
551
|
-
size_t timepoint) const override
|
552
|
-
{
|
553
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
554
|
-
return make_unique<_DocType>(_updateDoc(doc, timepoint));
|
535
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
536
|
+
return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
555
537
|
}
|
556
538
|
|
557
539
|
Float getAlpha(size_t k, size_t t) const override
|
558
540
|
{
|
559
|
-
return alphas(k, t);
|
541
|
+
if (alphas.size()) return alphas(k, t);
|
542
|
+
return 0;
|
560
543
|
}
|
561
544
|
|
562
545
|
std::vector<Float> getPhi(size_t k, size_t t) const override
|
@@ -353,58 +353,36 @@ namespace tomoto
|
|
353
353
|
}
|
354
354
|
|
355
355
|
template<bool _const = false>
|
356
|
-
_DocType& _updateDoc(_DocType& doc, const std::vector<
|
356
|
+
_DocType& _updateDoc(_DocType& doc, const std::vector<Float>& metadata) const
|
357
357
|
{
|
358
358
|
if (metadata.size() != degreeByF.size())
|
359
359
|
throw std::invalid_argument{ "a length of `metadata` should be equal to a length of `degrees`" };
|
360
|
-
|
361
|
-
std::transform(metadata.begin(), metadata.end(), back_inserter(doc.metadataOrg), [](const std::string& w)
|
362
|
-
{
|
363
|
-
return std::stof(w);
|
364
|
-
});
|
360
|
+
doc.metadataOrg = metadata;
|
365
361
|
return doc;
|
366
362
|
}
|
367
363
|
|
368
|
-
size_t addDoc(const
|
369
|
-
{
|
370
|
-
auto doc = this->_makeDoc(words);
|
371
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
372
|
-
}
|
373
|
-
|
374
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const override
|
364
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
375
365
|
{
|
376
|
-
auto doc =
|
377
|
-
return
|
366
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
367
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
378
368
|
}
|
379
369
|
|
380
|
-
|
381
|
-
const std::vector<std::string>& metadata) override
|
370
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
382
371
|
{
|
383
|
-
auto doc = this->template
|
384
|
-
return this->
|
372
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
373
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
385
374
|
}
|
386
375
|
|
387
|
-
|
388
|
-
const std::vector<std::string>& metadata) const override
|
376
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
389
377
|
{
|
390
|
-
auto doc =
|
391
|
-
return
|
378
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
379
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
392
380
|
}
|
393
381
|
|
394
|
-
|
395
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
396
|
-
const std::vector<std::string>& metadata) override
|
382
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
397
383
|
{
|
398
|
-
auto doc = this->
|
399
|
-
return this->
|
400
|
-
}
|
401
|
-
|
402
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
403
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
404
|
-
const std::vector<std::string>& metadata) const override
|
405
|
-
{
|
406
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
407
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
384
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
385
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
408
386
|
}
|
409
387
|
|
410
388
|
std::vector<Float> getTopicsByDoc(const _DocType& doc) const
|
@@ -428,7 +406,10 @@ namespace tomoto
|
|
428
406
|
std::vector<Float> getLambdaByTopic(Tid tid) const override
|
429
407
|
{
|
430
408
|
std::vector<Float> ret(this->F);
|
431
|
-
|
409
|
+
if (this->lambda.size())
|
410
|
+
{
|
411
|
+
Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ ret.data(), (Eigen::Index)ret.size() } = this->lambda.row(tid);
|
412
|
+
}
|
432
413
|
return ret;
|
433
414
|
}
|
434
415
|
|