tomoto 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -0
- data/ext/tomoto/ct.cpp +54 -0
- data/ext/tomoto/dmr.cpp +62 -0
- data/ext/tomoto/dt.cpp +82 -0
- data/ext/tomoto/ext.cpp +27 -773
- data/ext/tomoto/gdmr.cpp +34 -0
- data/ext/tomoto/hdp.cpp +42 -0
- data/ext/tomoto/hlda.cpp +66 -0
- data/ext/tomoto/hpa.cpp +27 -0
- data/ext/tomoto/lda.cpp +250 -0
- data/ext/tomoto/llda.cpp +29 -0
- data/ext/tomoto/mglda.cpp +71 -0
- data/ext/tomoto/pa.cpp +27 -0
- data/ext/tomoto/plda.cpp +29 -0
- data/ext/tomoto/slda.cpp +40 -0
- data/ext/tomoto/utils.h +84 -0
- data/lib/tomoto/tomoto.bundle +0 -0
- data/lib/tomoto/tomoto.so +0 -0
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +12 -3
- data/vendor/tomotopy/README.rst +12 -3
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
- data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
- data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
- data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
- data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
- data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
- data/vendor/tomotopy/src/Utils/math.h +8 -4
- data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
- metadata +24 -60
@@ -121,6 +121,7 @@ namespace tomoto
|
|
121
121
|
};
|
122
122
|
|
123
123
|
enum class ParallelScheme { default_, none, copy_merge, partition, size };
|
124
|
+
enum class GlobalSampler { train, freeze_topics, inference, size };
|
124
125
|
|
125
126
|
inline const char* toString(ParallelScheme ps)
|
126
127
|
{
|
@@ -236,7 +237,7 @@ namespace tomoto
|
|
236
237
|
virtual const std::vector<uint64_t>& getVocabCf() const = 0;
|
237
238
|
virtual const std::vector<uint64_t>& getVocabDf() const = 0;
|
238
239
|
|
239
|
-
virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_) = 0;
|
240
|
+
virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
|
240
241
|
virtual size_t getGlobalStep() const = 0;
|
241
242
|
virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) = 0;
|
242
243
|
|
@@ -588,7 +589,7 @@ namespace tomoto
|
|
588
589
|
return ps;
|
589
590
|
}
|
590
591
|
|
591
|
-
int train(size_t iteration, size_t numWorkers, ParallelScheme ps) override
|
592
|
+
int train(size_t iteration, size_t numWorkers, ParallelScheme ps, bool freeze_topics = false) override
|
592
593
|
{
|
593
594
|
if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
|
594
595
|
ps = getRealScheme(ps);
|
@@ -606,16 +607,20 @@ namespace tomoto
|
|
606
607
|
localRG.emplace_back(rg());
|
607
608
|
}
|
608
609
|
|
609
|
-
|
610
|
+
if (ps == ParallelScheme::copy_merge)
|
610
611
|
{
|
611
|
-
|
612
|
+
for (size_t i = 0; i < numWorkers; ++i)
|
613
|
+
{
|
614
|
+
localData.emplace_back(static_cast<_Derived*>(this)->globalState);
|
615
|
+
}
|
612
616
|
}
|
613
|
-
|
614
|
-
if (ps == ParallelScheme::partition)
|
617
|
+
else if (ps == ParallelScheme::partition)
|
615
618
|
{
|
616
619
|
localData.resize(numWorkers);
|
617
|
-
static_cast<_Derived*>(this)->updatePartition(
|
618
|
-
|
620
|
+
static_cast<_Derived*>(this)->updatePartition(
|
621
|
+
*cachedPool, globalState, localData.data(), docs.begin(), docs.end(),
|
622
|
+
static_cast<_Derived*>(this)->eddTrain
|
623
|
+
);
|
619
624
|
}
|
620
625
|
|
621
626
|
auto state = ps == ParallelScheme::none ? &globalState : localData.data();
|
@@ -629,15 +634,15 @@ namespace tomoto
|
|
629
634
|
{
|
630
635
|
case ParallelScheme::none:
|
631
636
|
static_cast<_Derived*>(this)->template trainOne<ParallelScheme::none>(
|
632
|
-
*cachedPool, state, localRG.data());
|
637
|
+
*cachedPool, state, localRG.data(), freeze_topics);
|
633
638
|
break;
|
634
639
|
case ParallelScheme::copy_merge:
|
635
640
|
static_cast<_Derived*>(this)->template trainOne<ParallelScheme::copy_merge>(
|
636
|
-
*cachedPool, state, localRG.data());
|
641
|
+
*cachedPool, state, localRG.data(), freeze_topics);
|
637
642
|
break;
|
638
643
|
case ParallelScheme::partition:
|
639
644
|
static_cast<_Derived*>(this)->template trainOne<ParallelScheme::partition>(
|
640
|
-
*cachedPool, state, localRG.data());
|
645
|
+
*cachedPool, state, localRG.data(), freeze_topics);
|
641
646
|
break;
|
642
647
|
}
|
643
648
|
break;
|
@@ -51,8 +51,9 @@ namespace tomoto
|
|
51
51
|
}
|
52
52
|
if (len > 1) newDist.cov /= len - 1;
|
53
53
|
}
|
54
|
-
|
55
|
-
newDist.
|
54
|
+
Eigen::MatrixXd l = newDist.cov.template cast<double>().llt().matrixL();
|
55
|
+
newDist.l = l.template cast<float>();
|
56
|
+
newDist.logDet = l.diagonal().array().log().sum();
|
56
57
|
return newDist;
|
57
58
|
}
|
58
59
|
|
@@ -26,10 +26,31 @@ namespace tomoto
|
|
26
26
|
}
|
27
27
|
};
|
28
28
|
|
29
|
+
template<class _Map, class _Node>
|
30
|
+
class TrieIterator : public _Map::const_iterator
|
31
|
+
{
|
32
|
+
using Base = typename _Map::const_iterator;
|
33
|
+
using Key = typename _Map::key_type;
|
34
|
+
const _Node* base = nullptr;
|
35
|
+
public:
|
36
|
+
|
37
|
+
TrieIterator(const Base& it, const _Node* _base)
|
38
|
+
: Base(it), base(_base)
|
39
|
+
{
|
40
|
+
}
|
41
|
+
|
42
|
+
std::pair<const Key, const _Node*> operator*() const
|
43
|
+
{
|
44
|
+
auto p = Base::operator*();
|
45
|
+
return std::make_pair(p.first, base + p.second);
|
46
|
+
}
|
47
|
+
};
|
48
|
+
|
29
49
|
template<class _Key, class _Value, class _KeyStore = ConstAccess<std::map<_Key, int32_t>>, class _Trie = void>
|
30
50
|
struct Trie
|
31
51
|
{
|
32
52
|
using Node = typename std::conditional<std::is_same<_Trie, void>::value, Trie, _Trie>::type;
|
53
|
+
using iterator = TrieIterator<_KeyStore, Node>;
|
33
54
|
_KeyStore next = {};
|
34
55
|
int32_t fail = 0;
|
35
56
|
_Value val = {};
|
@@ -47,13 +68,23 @@ namespace tomoto
|
|
47
68
|
return fail ? (Node*)this + fail : nullptr;
|
48
69
|
}
|
49
70
|
|
71
|
+
iterator begin() const
|
72
|
+
{
|
73
|
+
return { next.begin(), (const Node*)this };
|
74
|
+
}
|
75
|
+
|
76
|
+
iterator end() const
|
77
|
+
{
|
78
|
+
return { next.end(), (const Node*)this };
|
79
|
+
}
|
80
|
+
|
50
81
|
template<typename _TyIter, typename _FnAlloc>
|
51
|
-
|
82
|
+
Node* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
|
52
83
|
{
|
53
84
|
if (first == last)
|
54
85
|
{
|
55
86
|
if (!val) val = _val;
|
56
|
-
return;
|
87
|
+
return (Node*)this;
|
57
88
|
}
|
58
89
|
|
59
90
|
auto v = *first;
|
@@ -61,13 +92,13 @@ namespace tomoto
|
|
61
92
|
{
|
62
93
|
next[v] = alloc() - this;
|
63
94
|
}
|
64
|
-
getNext(v)->build(++first, last, _val, alloc);
|
95
|
+
return getNext(v)->build(++first, last, _val, alloc);
|
65
96
|
}
|
66
97
|
|
67
98
|
template<typename _TyIter>
|
68
99
|
Node* findNode(_TyIter begin, _TyIter end)
|
69
100
|
{
|
70
|
-
if (begin == end) return this;
|
101
|
+
if (begin == end) return (Node*)this;
|
71
102
|
auto n = getNext(*begin);
|
72
103
|
if (n) return n->findNode(++begin, end);
|
73
104
|
return nullptr;
|
@@ -173,21 +204,21 @@ namespace tomoto
|
|
173
204
|
int32_t parent = 0;
|
174
205
|
|
175
206
|
template<typename _TyIter, typename _FnAlloc>
|
176
|
-
|
207
|
+
TrieEx* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
|
177
208
|
{
|
178
209
|
if (first == last)
|
179
210
|
{
|
180
211
|
if (!this->val) this->val = _val;
|
181
|
-
return;
|
212
|
+
return this;
|
182
213
|
}
|
183
214
|
|
184
215
|
auto v = *first;
|
185
|
-
if (!getNext(v))
|
216
|
+
if (!this->getNext(v))
|
186
217
|
{
|
187
218
|
this->next[v] = alloc() - this;
|
188
219
|
this->getNext(v)->parent = -this->next[v];
|
189
220
|
}
|
190
|
-
this->getNext(v)->build(++first, last, _val, alloc);
|
221
|
+
return this->getNext(v)->build(++first, last, _val, alloc);
|
191
222
|
}
|
192
223
|
|
193
224
|
template<typename _FnAlloc>
|
@@ -14,57 +14,55 @@ namespace tomoto
|
|
14
14
|
const Eigen::Matrix<_Ty, -1, 1>& lowerBound,
|
15
15
|
const Eigen::Matrix<_Ty, -1, 1>& upperBound,
|
16
16
|
_Rng& rng,
|
17
|
-
size_t
|
17
|
+
size_t burnIn
|
18
|
+
)
|
18
19
|
{
|
19
|
-
constexpr _Ty epsilon = 1e-6;
|
20
20
|
const size_t K = ret.size();
|
21
|
-
Eigen::Matrix<_Ty, -1, 1>
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
for (size_t i = 0; i <
|
21
|
+
Eigen::Matrix<_Ty, -1, -1> l = multiNormal.getCovL();
|
22
|
+
ret = (lowerBound + upperBound) / 2;
|
23
|
+
Eigen::Matrix<_Ty, -1, 1> z = l.template triangularView<Eigen::Lower>().solve(ret - multiNormal.mean),
|
24
|
+
a = lowerBound - multiNormal.mean,
|
25
|
+
b = upperBound - multiNormal.mean,
|
26
|
+
t, at, bt;
|
27
|
+
for (size_t i = 0; i < burnIn; ++i)
|
28
28
|
{
|
29
|
-
|
30
|
-
if (i) std::shuffle(ks.begin(), ks.end(), rng);
|
31
|
-
for (size_t kx = 0; kx < K; ++kx)
|
29
|
+
for (size_t j = 0; j < K; ++j)
|
32
30
|
{
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
_Ty nLower = lowers[0], nUpper = uppers[0];
|
42
|
-
if (l(k, k) < 0) std::swap(nLower, nUpper);
|
43
|
-
if (i)
|
31
|
+
auto lj = l.col(j);
|
32
|
+
z[j] = 0;
|
33
|
+
t = l * z;
|
34
|
+
_Ty lower_pos = -INFINITY, upper_pos = INFINITY,
|
35
|
+
lower_neg = -INFINITY, upper_neg = INFINITY;
|
36
|
+
at = ((a - t).array() / lj.array()).matrix();
|
37
|
+
bt = ((b - t).array() / lj.array()).matrix();
|
38
|
+
for (size_t k = 0; k < K; ++k)
|
44
39
|
{
|
45
|
-
|
40
|
+
if (lj[k] > 0)
|
41
|
+
{
|
42
|
+
lower_pos = std::max(lower_pos, at[k]);
|
43
|
+
upper_pos = std::min(upper_pos, bt[k]);
|
44
|
+
}
|
45
|
+
else if (lj[k] < 0)
|
46
46
|
{
|
47
|
-
|
48
|
-
|
49
|
-
if (lowers[j] > nLower) nLower = lowers[j];
|
50
|
-
if (uppers[j] < nUpper) nUpper = uppers[j];
|
51
|
-
}
|
52
|
-
else if (l.col(k)(j + k) < -epsilon)
|
53
|
-
{
|
54
|
-
if (uppers[j] > nLower) nLower = uppers[j];
|
55
|
-
if (lowers[j] < nUpper) nUpper = lowers[j];
|
56
|
-
}
|
47
|
+
lower_neg = std::max(lower_neg, bt[k]);
|
48
|
+
upper_neg = std::min(upper_neg, at[k]);
|
57
49
|
}
|
58
50
|
}
|
59
|
-
|
51
|
+
lower_pos = std::max(lower_pos, lower_neg);
|
52
|
+
upper_pos = std::min(upper_pos, upper_neg);
|
53
|
+
// this is due to numerical instability
|
54
|
+
if (lower_pos >= upper_pos)
|
55
|
+
{
|
56
|
+
std::cerr << __FILE__ << "(" << __LINE__ << "): wrong truncation range [" << lower_pos << ", " << upper_pos << "]" << std::endl;
|
57
|
+
z[j] = (lower_pos + upper_pos) / 2;
|
58
|
+
}
|
60
59
|
else
|
61
60
|
{
|
62
|
-
|
61
|
+
z[j] = rtnorm::rtnorm(rng, lower_pos, upper_pos);
|
63
62
|
}
|
64
63
|
}
|
65
64
|
}
|
66
|
-
ret = l *
|
67
|
-
ret += multiNormal.mean;
|
65
|
+
ret = (l * z) + multiNormal.mean;
|
68
66
|
return ret;
|
69
67
|
}
|
70
68
|
|
@@ -70,7 +70,7 @@ namespace tomoto
|
|
70
70
|
}
|
71
71
|
|
72
72
|
template<class UnaryFunction>
|
73
|
-
UnaryFunction
|
73
|
+
UnaryFunction forShuffled(size_t N, size_t seed, UnaryFunction f)
|
74
74
|
{
|
75
75
|
static size_t primes[16] = {
|
76
76
|
65537, 65539, 65543, 65551, 65557, 65563,
|
@@ -206,132 +206,137 @@ namespace tomoto
|
|
206
206
|
}
|
207
207
|
|
208
208
|
template <typename _UnaryFunc, typename _Iterator>
|
209
|
-
class TransformIter
|
209
|
+
class TransformIter : public _Iterator
|
210
210
|
{
|
211
211
|
private:
|
212
|
-
_Iterator i;
|
213
212
|
_UnaryFunc f;
|
214
213
|
public:
|
215
214
|
using reference = typename std::result_of<
|
216
215
|
const _UnaryFunc(typename std::iterator_traits<_Iterator>::reference)
|
217
216
|
>::type;
|
218
217
|
using value_type = reference;
|
219
|
-
|
220
|
-
using pointer = void;
|
221
|
-
using iterator_category = typename std::iterator_traits<_Iterator>::iterator_category;
|
222
|
-
using difference_type = typename std::iterator_traits<_Iterator>::difference_type;
|
223
|
-
|
218
|
+
|
224
219
|
TransformIter(const _Iterator& _iter = {}, _UnaryFunc _f = {})
|
225
|
-
:
|
220
|
+
: _Iterator(_iter), f(_f)
|
226
221
|
{}
|
227
222
|
|
228
223
|
reference operator*()
|
229
224
|
{
|
230
|
-
return f(*
|
225
|
+
return f(_Iterator::operator*());
|
231
226
|
}
|
232
227
|
|
233
228
|
const reference operator*() const
|
234
229
|
{
|
235
|
-
return f(*
|
230
|
+
return f(_Iterator::operator*());
|
236
231
|
}
|
237
232
|
|
238
233
|
reference operator[](std::size_t idx)
|
239
234
|
{
|
240
|
-
return f(
|
235
|
+
return f(_Iterator::operator[](idx));
|
241
236
|
}
|
242
237
|
|
243
238
|
const reference operator[](std::size_t idx) const
|
244
239
|
{
|
245
|
-
return f(
|
240
|
+
return f(_Iterator::operator[](idx));
|
246
241
|
}
|
247
242
|
|
248
243
|
TransformIter& operator++()
|
249
244
|
{
|
250
|
-
++
|
245
|
+
_Iterator::operator++();
|
251
246
|
return *this;
|
252
247
|
}
|
253
248
|
|
254
|
-
TransformIter
|
249
|
+
TransformIter operator++(int)
|
255
250
|
{
|
256
251
|
auto c = *this;
|
257
|
-
++
|
252
|
+
_Iterator::operator++();
|
258
253
|
return c;
|
259
254
|
}
|
260
255
|
|
261
256
|
TransformIter& operator--()
|
262
257
|
{
|
263
|
-
--
|
258
|
+
_Iterator::operator--();
|
264
259
|
return *this;
|
265
260
|
}
|
266
261
|
|
267
|
-
TransformIter
|
262
|
+
TransformIter operator--(int)
|
268
263
|
{
|
269
264
|
auto c = *this;
|
270
|
-
--
|
265
|
+
_Iterator::operator--();
|
271
266
|
return c;
|
272
267
|
}
|
273
268
|
|
274
269
|
TransformIter operator+(int n) const
|
275
270
|
{
|
276
|
-
return {
|
271
|
+
return { _Iterator::operator+(n), f };
|
277
272
|
}
|
278
273
|
|
279
274
|
TransformIter operator-(int n) const
|
280
275
|
{
|
281
|
-
return {
|
276
|
+
return { _Iterator::operator-(n), f };
|
282
277
|
}
|
283
278
|
|
284
279
|
TransformIter& operator+=(int n)
|
285
280
|
{
|
286
|
-
|
281
|
+
_Iterator::operator+=(n);
|
287
282
|
return *this;
|
288
283
|
}
|
289
284
|
|
290
285
|
TransformIter& operator-=(int n)
|
291
286
|
{
|
292
|
-
|
287
|
+
_Iterator::operator-=(n);
|
293
288
|
return *this;
|
294
289
|
}
|
295
290
|
|
296
291
|
typename std::iterator_traits<_Iterator>::difference_type operator-(const TransformIter& o) const
|
297
292
|
{
|
298
|
-
return
|
293
|
+
return (const _Iterator&)*this - (const _Iterator&)o;
|
299
294
|
}
|
300
295
|
|
301
|
-
|
302
|
-
{
|
303
|
-
return i == o.i;
|
304
|
-
}
|
296
|
+
};
|
305
297
|
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
}
|
298
|
+
template <typename _UnaryFunc, typename _Iterator>
|
299
|
+
TransformIter<_UnaryFunc, _Iterator> makeTransformIter(const _Iterator& iter, _UnaryFunc f)
|
300
|
+
{
|
301
|
+
return { iter, f };
|
302
|
+
}
|
310
303
|
|
311
|
-
|
304
|
+
template <typename _Iterator>
|
305
|
+
class StrideIter : public _Iterator
|
306
|
+
{
|
307
|
+
size_t stride;
|
308
|
+
const _Iterator end;
|
309
|
+
public:
|
310
|
+
StrideIter(const _Iterator& iter, size_t _stride = 1, const _Iterator& _end = {})
|
311
|
+
: _Iterator{ iter }, stride{ _stride }, end{ _end }
|
312
312
|
{
|
313
|
-
return i < o.i;
|
314
313
|
}
|
315
314
|
|
316
|
-
|
317
|
-
|
318
|
-
return i > o.i;
|
319
|
-
}
|
315
|
+
StrideIter(const StrideIter&) = default;
|
316
|
+
StrideIter(StrideIter&&) = default;
|
320
317
|
|
321
|
-
|
318
|
+
StrideIter& operator++()
|
322
319
|
{
|
323
|
-
|
320
|
+
for (size_t i = 0; i < stride && *this != end; ++i)
|
321
|
+
{
|
322
|
+
_Iterator::operator++();
|
323
|
+
}
|
324
|
+
return *this;
|
324
325
|
}
|
325
326
|
|
326
|
-
|
327
|
+
StrideIter& operator--()
|
327
328
|
{
|
328
|
-
|
329
|
+
for (size_t i = 0; i < stride && *this != end; ++i)
|
330
|
+
{
|
331
|
+
_Iterator::operator--();
|
332
|
+
}
|
333
|
+
return *this;
|
329
334
|
}
|
330
335
|
};
|
331
336
|
|
332
|
-
template <typename
|
333
|
-
|
337
|
+
template <typename _Iterator>
|
338
|
+
StrideIter<_Iterator> makeStrideIter(const _Iterator& iter, size_t stride, const _Iterator& end = {})
|
334
339
|
{
|
335
|
-
return { iter,
|
340
|
+
return { iter, stride, end };
|
336
341
|
}
|
337
342
|
}
|