tomoto 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/ext/tomoto/ext.cpp +34 -9
- data/ext/tomoto/extconf.rb +2 -1
- data/lib/tomoto/dmr.rb +1 -1
- data/lib/tomoto/gdmr.rb +1 -1
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/LICENSE +1 -1
- data/vendor/tomotopy/README.kr.rst +32 -3
- data/vendor/tomotopy/README.rst +30 -1
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +133 -147
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +158 -5
- data/vendor/tomotopy/src/TopicModel/DMR.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +15 -34
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -16
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +15 -32
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +18 -37
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +16 -20
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDA.h +0 -11
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +9 -21
- data/vendor/tomotopy/src/TopicModel/LLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +59 -72
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +12 -30
- data/vendor/tomotopy/src/TopicModel/SLDA.h +0 -15
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +17 -35
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +158 -38
- data/vendor/tomotopy/src/Utils/Dictionary.h +40 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +122 -3
- data/vendor/tomotopy/src/Utils/SharedString.hpp +181 -0
- data/vendor/tomotopy/src/Utils/math.h +1 -1
- data/vendor/tomotopy/src/Utils/sample.hpp +1 -1
- data/vendor/tomotopy/src/Utils/serializer.hpp +17 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- data/vendor/variant/include/mapbox/optional.hpp +74 -0
- data/vendor/variant/include/mapbox/recursive_wrapper.hpp +122 -0
- data/vendor/variant/include/mapbox/variant.hpp +974 -0
- data/vendor/variant/include/mapbox/variant_io.hpp +45 -0
- metadata +15 -7
@@ -16,12 +16,163 @@ namespace tomoto
|
|
16
16
|
{
|
17
17
|
namespace label
|
18
18
|
{
|
19
|
+
template<typename _DocIter, typename _Freqs>
|
20
|
+
std::vector<Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
|
21
|
+
_Freqs&& vocabFreqs, _Freqs&& vocabDf,
|
22
|
+
size_t candMinCnt, size_t candMinDf, size_t minNgrams, size_t maxNgrams, size_t maxCandidates, float minScore)
|
23
|
+
{
|
24
|
+
struct vvhash
|
25
|
+
{
|
26
|
+
size_t operator()(const std::pair<Vid, Vid>& k) const
|
27
|
+
{
|
28
|
+
return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
|
29
|
+
}
|
30
|
+
};
|
31
|
+
|
32
|
+
// counting unigrams & bigrams
|
33
|
+
std::unordered_map<std::pair<Vid, Vid>, size_t, vvhash> bigramCnt, bigramDf;
|
34
|
+
|
35
|
+
for(auto docIt = docBegin; docIt != docEnd; ++docIt)
|
36
|
+
{
|
37
|
+
std::unordered_set<std::pair<Vid, Vid>, vvhash> uniqBigram;
|
38
|
+
auto doc = *docIt;
|
39
|
+
Vid prevWord = doc[0];
|
40
|
+
for (size_t j = 1; j < doc.size(); ++j)
|
41
|
+
{
|
42
|
+
Vid curWord = doc[j];
|
43
|
+
if (curWord != non_vocab_id && vocabFreqs[curWord] >= candMinCnt && vocabDf[curWord] >= candMinDf)
|
44
|
+
{
|
45
|
+
if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt && vocabDf[prevWord] >= candMinDf)
|
46
|
+
{
|
47
|
+
bigramCnt[std::make_pair(prevWord, curWord)]++;
|
48
|
+
uniqBigram.emplace(prevWord, curWord);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
prevWord = curWord;
|
52
|
+
}
|
53
|
+
|
54
|
+
for (auto& p : uniqBigram) bigramDf[p]++;
|
55
|
+
}
|
56
|
+
|
57
|
+
|
58
|
+
// counting ngrams
|
59
|
+
std::vector<TrieEx<Vid, size_t>> trieNodes;
|
60
|
+
|
61
|
+
if (maxNgrams > 2)
|
62
|
+
{
|
63
|
+
std::unordered_set<std::pair<Vid, Vid>, vvhash> validPair;
|
64
|
+
for (auto& p : bigramCnt)
|
65
|
+
{
|
66
|
+
if (p.second >= candMinCnt) validPair.emplace(p.first);
|
67
|
+
}
|
68
|
+
|
69
|
+
trieNodes.resize(1);
|
70
|
+
auto allocNode = [&]() { return trieNodes.emplace_back(), & trieNodes.back(); };
|
71
|
+
|
72
|
+
for (auto docIt = docBegin; docIt != docEnd; ++docIt)
|
73
|
+
{
|
74
|
+
auto doc = *docIt;
|
75
|
+
if (trieNodes.capacity() < trieNodes.size() + doc.size() * maxNgrams)
|
76
|
+
{
|
77
|
+
trieNodes.reserve(std::max(trieNodes.size() + doc.size() * maxNgrams, trieNodes.capacity() * 2));
|
78
|
+
}
|
79
|
+
|
80
|
+
Vid prevWord = doc[0];
|
81
|
+
size_t labelLen = 0;
|
82
|
+
auto node = &trieNodes[0];
|
83
|
+
if (prevWord != non_vocab_id && vocabFreqs[prevWord] >= candMinCnt)
|
84
|
+
{
|
85
|
+
node = trieNodes[0].makeNext(prevWord, allocNode);
|
86
|
+
node->val++;
|
87
|
+
labelLen = 1;
|
88
|
+
}
|
89
|
+
|
90
|
+
for (size_t j = 1; j < doc.size(); ++j)
|
91
|
+
{
|
92
|
+
Vid curWord = doc[j];
|
93
|
+
|
94
|
+
if (curWord != non_vocab_id && vocabFreqs[curWord] < candMinCnt)
|
95
|
+
{
|
96
|
+
node = &trieNodes[0];
|
97
|
+
labelLen = 0;
|
98
|
+
}
|
99
|
+
else
|
100
|
+
{
|
101
|
+
if (labelLen >= maxNgrams)
|
102
|
+
{
|
103
|
+
node = node->getFail();
|
104
|
+
labelLen--;
|
105
|
+
}
|
106
|
+
|
107
|
+
if (validPair.count(std::make_pair(prevWord, curWord)))
|
108
|
+
{
|
109
|
+
auto nnode = node->makeNext(curWord, allocNode);
|
110
|
+
node = nnode;
|
111
|
+
do
|
112
|
+
{
|
113
|
+
nnode->val++;
|
114
|
+
} while (nnode = nnode->getFail());
|
115
|
+
labelLen++;
|
116
|
+
}
|
117
|
+
else
|
118
|
+
{
|
119
|
+
node = trieNodes[0].makeNext(curWord, allocNode);
|
120
|
+
node->val++;
|
121
|
+
labelLen = 1;
|
122
|
+
}
|
123
|
+
}
|
124
|
+
prevWord = curWord;
|
125
|
+
}
|
126
|
+
}
|
127
|
+
}
|
128
|
+
|
129
|
+
float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
|
130
|
+
|
131
|
+
// calculating PMIs
|
132
|
+
std::vector<Candidate> candidates;
|
133
|
+
for (auto& p : bigramCnt)
|
134
|
+
{
|
135
|
+
auto& bigram = p.first;
|
136
|
+
if (p.second < candMinCnt) continue;
|
137
|
+
if (bigramDf[bigram] < candMinDf) continue;
|
138
|
+
auto pmi = std::log(p.second * totN
|
139
|
+
/ vocabFreqs[bigram.first] / vocabFreqs[bigram.second]);
|
140
|
+
if (pmi <= 0) continue;
|
141
|
+
candidates.emplace_back(pmi, bigram.first, bigram.second);
|
142
|
+
}
|
143
|
+
|
144
|
+
if (maxNgrams > 2)
|
145
|
+
{
|
146
|
+
std::vector<Vid> rkeys;
|
147
|
+
trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
|
148
|
+
{
|
149
|
+
if (rkeys.size() <= 2 || rkeys.size() < minNgrams || node->val < candMinCnt) return;
|
150
|
+
auto pmi = node->val / totN;
|
151
|
+
for (auto k : rkeys)
|
152
|
+
{
|
153
|
+
pmi *= totN / vocabFreqs[k];
|
154
|
+
}
|
155
|
+
pmi = std::log(pmi);
|
156
|
+
if (pmi < minScore) return;
|
157
|
+
candidates.emplace_back(pmi, rkeys);
|
158
|
+
}, rkeys);
|
159
|
+
}
|
160
|
+
|
161
|
+
std::sort(candidates.begin(), candidates.end(), [](const Candidate& a, const Candidate& b)
|
162
|
+
{
|
163
|
+
return a.score > b.score;
|
164
|
+
});
|
165
|
+
if (candidates.size() > maxCandidates) candidates.erase(candidates.begin() + maxCandidates, candidates.end());
|
166
|
+
return candidates;
|
167
|
+
}
|
168
|
+
|
169
|
+
|
19
170
|
class PMIExtractor : public IExtractor
|
20
171
|
{
|
21
|
-
size_t candMinCnt, candMinDf, maxLabelLen, maxCandidates;
|
172
|
+
size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
|
22
173
|
public:
|
23
|
-
PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
|
24
|
-
: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
|
174
|
+
PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000)
|
175
|
+
: candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen}, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
|
25
176
|
{
|
26
177
|
}
|
27
178
|
|
@@ -33,7 +184,7 @@ namespace tomoto
|
|
33
184
|
struct CandidateEx : public Candidate
|
34
185
|
{
|
35
186
|
std::unordered_map<std::string, size_t> names;
|
36
|
-
std::
|
187
|
+
std::set<size_t> docIds;
|
37
188
|
Eigen::Array<Float, -1, 1> scores;
|
38
189
|
|
39
190
|
CandidateEx()
|
@@ -49,6 +200,7 @@ namespace tomoto
|
|
49
200
|
const ITopicModel* tm;
|
50
201
|
size_t candMinDf;
|
51
202
|
float smoothing, lambda, mu;
|
203
|
+
size_t windowSize;
|
52
204
|
std::unique_ptr<ThreadPool> pool;
|
53
205
|
std::unique_ptr<std::mutex[]> mtx;
|
54
206
|
std::vector<CandidateEx> candidates;
|
@@ -63,9 +215,10 @@ namespace tomoto
|
|
63
215
|
FoRelevance(const ITopicModel* _tm,
|
64
216
|
_Iter candFirst, _Iter candEnd,
|
65
217
|
size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
|
218
|
+
size_t _windowSize = (size_t)-1,
|
66
219
|
size_t numWorkers = 0)
|
67
220
|
: tm{ _tm }, candMinDf{ _candMinDf },
|
68
|
-
smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }
|
221
|
+
smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }, windowSize{ _windowSize }
|
69
222
|
{
|
70
223
|
if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
|
71
224
|
if (numWorkers > 1)
|
@@ -8,7 +8,7 @@ namespace tomoto
|
|
8
8
|
{
|
9
9
|
using BaseDocument = DocumentLDA<_tw>;
|
10
10
|
using DocumentLDA<_tw>::DocumentLDA;
|
11
|
-
|
11
|
+
uint64_t metadata = 0;
|
12
12
|
|
13
13
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
|
14
14
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata);
|
@@ -23,21 +23,6 @@ namespace tomoto
|
|
23
23
|
size_t seed = std::random_device{}(),
|
24
24
|
bool scalarRng = false);
|
25
25
|
|
26
|
-
virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) = 0;
|
27
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const = 0;
|
28
|
-
|
29
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
30
|
-
const std::vector<std::string>& metadata) = 0;
|
31
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
32
|
-
const std::vector<std::string>& metadata) const = 0;
|
33
|
-
|
34
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
35
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
36
|
-
const std::vector<std::string>& metadata) = 0;
|
37
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
38
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
39
|
-
const std::vector<std::string>& metadata) const = 0;
|
40
|
-
|
41
26
|
virtual void setAlphaEps(Float _alphaEps) = 0;
|
42
27
|
virtual Float getAlphaEps() const = 0;
|
43
28
|
virtual void setOptimRepeat(size_t repeat) = 0;
|
@@ -262,63 +262,44 @@ namespace tomoto
|
|
262
262
|
}
|
263
263
|
|
264
264
|
template<bool _const = false>
|
265
|
-
_DocType& _updateDoc(_DocType& doc, const std::
|
265
|
+
_DocType& _updateDoc(_DocType& doc, const std::string& metadata)
|
266
266
|
{
|
267
|
-
std::string metadataJoined = text::join(metadata.begin(), metadata.end(), "_");
|
268
267
|
Vid xid;
|
269
268
|
if (_const)
|
270
269
|
{
|
271
|
-
xid = metadataDict.toWid(
|
270
|
+
xid = metadataDict.toWid(metadata);
|
272
271
|
if (xid == (Vid)-1) throw std::invalid_argument("unknown metadata");
|
273
272
|
}
|
274
273
|
else
|
275
274
|
{
|
276
|
-
xid = metadataDict.add(
|
275
|
+
xid = metadataDict.add(metadata);
|
277
276
|
}
|
278
277
|
doc.metadata = xid;
|
279
278
|
return doc;
|
280
279
|
}
|
281
280
|
|
282
|
-
size_t addDoc(const
|
281
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
283
282
|
{
|
284
|
-
auto doc = this->
|
285
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
283
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
284
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
|
286
285
|
}
|
287
286
|
|
288
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
287
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
289
288
|
{
|
290
|
-
auto doc = as_mutable(this)->template
|
291
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
289
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
290
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
|
292
291
|
}
|
293
292
|
|
294
|
-
size_t addDoc(const
|
295
|
-
const std::vector<std::string>& metadata) override
|
293
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
296
294
|
{
|
297
|
-
auto doc = this->
|
298
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
295
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
296
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::string>("metadata")));
|
299
297
|
}
|
300
298
|
|
301
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
302
|
-
const std::vector<std::string>& metadata) const override
|
299
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
303
300
|
{
|
304
|
-
auto doc = as_mutable(this)->template
|
305
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
306
|
-
}
|
307
|
-
|
308
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
309
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
310
|
-
const std::vector<std::string>& metadata) override
|
311
|
-
{
|
312
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
313
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
314
|
-
}
|
315
|
-
|
316
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
317
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
318
|
-
const std::vector<std::string>& metadata) const override
|
319
|
-
{
|
320
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
321
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
301
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
302
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::string>("metadata")));
|
322
303
|
}
|
323
304
|
|
324
305
|
GETTER(F, size_t, F);
|
@@ -10,7 +10,7 @@ namespace tomoto
|
|
10
10
|
using BaseDocument = DocumentLDA<_tw>;
|
11
11
|
using DocumentLDA<_tw>::DocumentLDA;
|
12
12
|
|
13
|
-
|
13
|
+
uint64_t timepoint = 0;
|
14
14
|
ShareableVector<Float> eta;
|
15
15
|
sample::AliasMethod<> aliasTable;
|
16
16
|
|
@@ -29,21 +29,6 @@ namespace tomoto
|
|
29
29
|
size_t seed = std::random_device{}(),
|
30
30
|
bool scalarRng = false);
|
31
31
|
|
32
|
-
virtual size_t addDoc(const std::vector<std::string>& words, size_t timepoint) = 0;
|
33
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, size_t timepoint) const = 0;
|
34
|
-
|
35
|
-
virtual size_t addDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
36
|
-
size_t timepoint) = 0;
|
37
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const RawDocTokenizer::Factory& tokenizer,
|
38
|
-
size_t timepoint) const = 0;
|
39
|
-
|
40
|
-
virtual size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
41
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
42
|
-
size_t timepoint) = 0;
|
43
|
-
virtual std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
44
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
45
|
-
size_t timepoint) const = 0;
|
46
|
-
|
47
32
|
virtual size_t getT() const = 0;
|
48
33
|
virtual std::vector<uint32_t> getNumDocsByT() const = 0;
|
49
34
|
|
@@ -468,7 +468,7 @@ namespace tomoto
|
|
468
468
|
return ret;
|
469
469
|
}
|
470
470
|
|
471
|
-
_DocType& _updateDoc(_DocType& doc,
|
471
|
+
_DocType& _updateDoc(_DocType& doc, uint32_t timepoint) const
|
472
472
|
{
|
473
473
|
if (timepoint >= T) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "timepoint must < T");
|
474
474
|
doc.timepoint = timepoint;
|
@@ -512,51 +512,34 @@ namespace tomoto
|
|
512
512
|
{
|
513
513
|
}
|
514
514
|
|
515
|
-
size_t addDoc(const
|
515
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
516
516
|
{
|
517
|
-
auto doc = this->
|
518
|
-
return this->_addDoc(_updateDoc(doc, timepoint));
|
517
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
518
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
519
519
|
}
|
520
520
|
|
521
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
521
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
522
522
|
{
|
523
|
-
auto doc = as_mutable(this)->template
|
524
|
-
return make_unique<_DocType>(_updateDoc(doc, timepoint));
|
523
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
524
|
+
return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
525
525
|
}
|
526
526
|
|
527
|
-
size_t addDoc(const
|
528
|
-
size_t timepoint) override
|
527
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
529
528
|
{
|
530
|
-
auto doc = this->
|
531
|
-
return this->_addDoc(_updateDoc(doc, timepoint));
|
529
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
530
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
532
531
|
}
|
533
532
|
|
534
|
-
std::unique_ptr<DocumentBase> makeDoc(const
|
535
|
-
size_t timepoint) const override
|
533
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
536
534
|
{
|
537
|
-
auto doc = as_mutable(this)->template
|
538
|
-
return make_unique<_DocType>(_updateDoc(doc, timepoint));
|
539
|
-
}
|
540
|
-
|
541
|
-
size_t addDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
542
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
543
|
-
size_t timepoint) override
|
544
|
-
{
|
545
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
546
|
-
return this->_addDoc(_updateDoc(doc, timepoint));
|
547
|
-
}
|
548
|
-
|
549
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
550
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
551
|
-
size_t timepoint) const override
|
552
|
-
{
|
553
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
554
|
-
return make_unique<_DocType>(_updateDoc(doc, timepoint));
|
535
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
536
|
+
return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc<uint32_t>("timepoint")));
|
555
537
|
}
|
556
538
|
|
557
539
|
Float getAlpha(size_t k, size_t t) const override
|
558
540
|
{
|
559
|
-
return alphas(k, t);
|
541
|
+
if (alphas.size()) return alphas(k, t);
|
542
|
+
return 0;
|
560
543
|
}
|
561
544
|
|
562
545
|
std::vector<Float> getPhi(size_t k, size_t t) const override
|
@@ -353,58 +353,36 @@ namespace tomoto
|
|
353
353
|
}
|
354
354
|
|
355
355
|
template<bool _const = false>
|
356
|
-
_DocType& _updateDoc(_DocType& doc, const std::vector<
|
356
|
+
_DocType& _updateDoc(_DocType& doc, const std::vector<Float>& metadata) const
|
357
357
|
{
|
358
358
|
if (metadata.size() != degreeByF.size())
|
359
359
|
throw std::invalid_argument{ "a length of `metadata` should be equal to a length of `degrees`" };
|
360
|
-
|
361
|
-
std::transform(metadata.begin(), metadata.end(), back_inserter(doc.metadataOrg), [](const std::string& w)
|
362
|
-
{
|
363
|
-
return std::stof(w);
|
364
|
-
});
|
360
|
+
doc.metadataOrg = metadata;
|
365
361
|
return doc;
|
366
362
|
}
|
367
363
|
|
368
|
-
size_t addDoc(const
|
369
|
-
{
|
370
|
-
auto doc = this->_makeDoc(words);
|
371
|
-
return this->_addDoc(_updateDoc(doc, metadata));
|
372
|
-
}
|
373
|
-
|
374
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const override
|
364
|
+
size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
|
375
365
|
{
|
376
|
-
auto doc =
|
377
|
-
return
|
366
|
+
auto doc = this->template _makeFromRawDoc<false>(rawDoc, tokenizer);
|
367
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
378
368
|
}
|
379
369
|
|
380
|
-
|
381
|
-
const std::vector<std::string>& metadata) override
|
370
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
382
371
|
{
|
383
|
-
auto doc = this->template
|
384
|
-
return this->
|
372
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
373
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
385
374
|
}
|
386
375
|
|
387
|
-
|
388
|
-
const std::vector<std::string>& metadata) const override
|
376
|
+
size_t addDoc(const RawDoc& rawDoc) override
|
389
377
|
{
|
390
|
-
auto doc =
|
391
|
-
return
|
378
|
+
auto doc = this->_makeFromRawDoc(rawDoc);
|
379
|
+
return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
392
380
|
}
|
393
381
|
|
394
|
-
|
395
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
396
|
-
const std::vector<std::string>& metadata) override
|
382
|
+
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
397
383
|
{
|
398
|
-
auto doc = this->
|
399
|
-
return this->
|
400
|
-
}
|
401
|
-
|
402
|
-
std::unique_ptr<DocumentBase> makeDoc(const std::string& rawStr, const std::vector<Vid>& words,
|
403
|
-
const std::vector<uint32_t>& pos, const std::vector<uint16_t>& len,
|
404
|
-
const std::vector<std::string>& metadata) const override
|
405
|
-
{
|
406
|
-
auto doc = this->_makeRawDoc(rawStr, words, pos, len);
|
407
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, metadata));
|
384
|
+
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
385
|
+
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMisc<std::vector<Float>>("metadata")));
|
408
386
|
}
|
409
387
|
|
410
388
|
std::vector<Float> getTopicsByDoc(const _DocType& doc) const
|
@@ -428,7 +406,10 @@ namespace tomoto
|
|
428
406
|
std::vector<Float> getLambdaByTopic(Tid tid) const override
|
429
407
|
{
|
430
408
|
std::vector<Float> ret(this->F);
|
431
|
-
|
409
|
+
if (this->lambda.size())
|
410
|
+
{
|
411
|
+
Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ ret.data(), (Eigen::Index)ret.size() } = this->lambda.row(tid);
|
412
|
+
}
|
432
413
|
return ret;
|
433
414
|
}
|
434
415
|
|