tomoto 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -0
- data/ext/tomoto/ct.cpp +54 -0
- data/ext/tomoto/dmr.cpp +62 -0
- data/ext/tomoto/dt.cpp +82 -0
- data/ext/tomoto/ext.cpp +27 -773
- data/ext/tomoto/gdmr.cpp +34 -0
- data/ext/tomoto/hdp.cpp +42 -0
- data/ext/tomoto/hlda.cpp +66 -0
- data/ext/tomoto/hpa.cpp +27 -0
- data/ext/tomoto/lda.cpp +250 -0
- data/ext/tomoto/llda.cpp +29 -0
- data/ext/tomoto/mglda.cpp +71 -0
- data/ext/tomoto/pa.cpp +27 -0
- data/ext/tomoto/plda.cpp +29 -0
- data/ext/tomoto/slda.cpp +40 -0
- data/ext/tomoto/utils.h +84 -0
- data/lib/tomoto/tomoto.bundle +0 -0
- data/lib/tomoto/tomoto.so +0 -0
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +12 -3
- data/vendor/tomotopy/README.rst +12 -3
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
- data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
- data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
- data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
- data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
- data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
- data/vendor/tomotopy/src/Utils/math.h +8 -4
- data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
- metadata +24 -60
@@ -121,6 +121,7 @@ namespace tomoto
|
|
121
121
|
};
|
122
122
|
|
123
123
|
enum class ParallelScheme { default_, none, copy_merge, partition, size };
|
124
|
+
enum class GlobalSampler { train, freeze_topics, inference, size };
|
124
125
|
|
125
126
|
inline const char* toString(ParallelScheme ps)
|
126
127
|
{
|
@@ -236,7 +237,7 @@ namespace tomoto
|
|
236
237
|
virtual const std::vector<uint64_t>& getVocabCf() const = 0;
|
237
238
|
virtual const std::vector<uint64_t>& getVocabDf() const = 0;
|
238
239
|
|
239
|
-
virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_) = 0;
|
240
|
+
virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
|
240
241
|
virtual size_t getGlobalStep() const = 0;
|
241
242
|
virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) = 0;
|
242
243
|
|
@@ -588,7 +589,7 @@ namespace tomoto
|
|
588
589
|
return ps;
|
589
590
|
}
|
590
591
|
|
591
|
-
int train(size_t iteration, size_t numWorkers, ParallelScheme ps) override
|
592
|
+
int train(size_t iteration, size_t numWorkers, ParallelScheme ps, bool freeze_topics = false) override
|
592
593
|
{
|
593
594
|
if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
|
594
595
|
ps = getRealScheme(ps);
|
@@ -606,16 +607,20 @@ namespace tomoto
|
|
606
607
|
localRG.emplace_back(rg());
|
607
608
|
}
|
608
609
|
|
609
|
-
|
610
|
+
if (ps == ParallelScheme::copy_merge)
|
610
611
|
{
|
611
|
-
|
612
|
+
for (size_t i = 0; i < numWorkers; ++i)
|
613
|
+
{
|
614
|
+
localData.emplace_back(static_cast<_Derived*>(this)->globalState);
|
615
|
+
}
|
612
616
|
}
|
613
|
-
|
614
|
-
if (ps == ParallelScheme::partition)
|
617
|
+
else if (ps == ParallelScheme::partition)
|
615
618
|
{
|
616
619
|
localData.resize(numWorkers);
|
617
|
-
static_cast<_Derived*>(this)->updatePartition(
|
618
|
-
|
620
|
+
static_cast<_Derived*>(this)->updatePartition(
|
621
|
+
*cachedPool, globalState, localData.data(), docs.begin(), docs.end(),
|
622
|
+
static_cast<_Derived*>(this)->eddTrain
|
623
|
+
);
|
619
624
|
}
|
620
625
|
|
621
626
|
auto state = ps == ParallelScheme::none ? &globalState : localData.data();
|
@@ -629,15 +634,15 @@ namespace tomoto
|
|
629
634
|
{
|
630
635
|
case ParallelScheme::none:
|
631
636
|
static_cast<_Derived*>(this)->template trainOne<ParallelScheme::none>(
|
632
|
-
*cachedPool, state, localRG.data());
|
637
|
+
*cachedPool, state, localRG.data(), freeze_topics);
|
633
638
|
break;
|
634
639
|
case ParallelScheme::copy_merge:
|
635
640
|
static_cast<_Derived*>(this)->template trainOne<ParallelScheme::copy_merge>(
|
636
|
-
*cachedPool, state, localRG.data());
|
641
|
+
*cachedPool, state, localRG.data(), freeze_topics);
|
637
642
|
break;
|
638
643
|
case ParallelScheme::partition:
|
639
644
|
static_cast<_Derived*>(this)->template trainOne<ParallelScheme::partition>(
|
640
|
-
*cachedPool, state, localRG.data());
|
645
|
+
*cachedPool, state, localRG.data(), freeze_topics);
|
641
646
|
break;
|
642
647
|
}
|
643
648
|
break;
|
@@ -51,8 +51,9 @@ namespace tomoto
|
|
51
51
|
}
|
52
52
|
if (len > 1) newDist.cov /= len - 1;
|
53
53
|
}
|
54
|
-
|
55
|
-
newDist.
|
54
|
+
Eigen::MatrixXd l = newDist.cov.template cast<double>().llt().matrixL();
|
55
|
+
newDist.l = l.template cast<float>();
|
56
|
+
newDist.logDet = l.diagonal().array().log().sum();
|
56
57
|
return newDist;
|
57
58
|
}
|
58
59
|
|
@@ -26,10 +26,31 @@ namespace tomoto
|
|
26
26
|
}
|
27
27
|
};
|
28
28
|
|
29
|
+
template<class _Map, class _Node>
|
30
|
+
class TrieIterator : public _Map::const_iterator
|
31
|
+
{
|
32
|
+
using Base = typename _Map::const_iterator;
|
33
|
+
using Key = typename _Map::key_type;
|
34
|
+
const _Node* base = nullptr;
|
35
|
+
public:
|
36
|
+
|
37
|
+
TrieIterator(const Base& it, const _Node* _base)
|
38
|
+
: Base(it), base(_base)
|
39
|
+
{
|
40
|
+
}
|
41
|
+
|
42
|
+
std::pair<const Key, const _Node*> operator*() const
|
43
|
+
{
|
44
|
+
auto p = Base::operator*();
|
45
|
+
return std::make_pair(p.first, base + p.second);
|
46
|
+
}
|
47
|
+
};
|
48
|
+
|
29
49
|
template<class _Key, class _Value, class _KeyStore = ConstAccess<std::map<_Key, int32_t>>, class _Trie = void>
|
30
50
|
struct Trie
|
31
51
|
{
|
32
52
|
using Node = typename std::conditional<std::is_same<_Trie, void>::value, Trie, _Trie>::type;
|
53
|
+
using iterator = TrieIterator<_KeyStore, Node>;
|
33
54
|
_KeyStore next = {};
|
34
55
|
int32_t fail = 0;
|
35
56
|
_Value val = {};
|
@@ -47,13 +68,23 @@ namespace tomoto
|
|
47
68
|
return fail ? (Node*)this + fail : nullptr;
|
48
69
|
}
|
49
70
|
|
71
|
+
iterator begin() const
|
72
|
+
{
|
73
|
+
return { next.begin(), (const Node*)this };
|
74
|
+
}
|
75
|
+
|
76
|
+
iterator end() const
|
77
|
+
{
|
78
|
+
return { next.end(), (const Node*)this };
|
79
|
+
}
|
80
|
+
|
50
81
|
template<typename _TyIter, typename _FnAlloc>
|
51
|
-
|
82
|
+
Node* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
|
52
83
|
{
|
53
84
|
if (first == last)
|
54
85
|
{
|
55
86
|
if (!val) val = _val;
|
56
|
-
return;
|
87
|
+
return (Node*)this;
|
57
88
|
}
|
58
89
|
|
59
90
|
auto v = *first;
|
@@ -61,13 +92,13 @@ namespace tomoto
|
|
61
92
|
{
|
62
93
|
next[v] = alloc() - this;
|
63
94
|
}
|
64
|
-
getNext(v)->build(++first, last, _val, alloc);
|
95
|
+
return getNext(v)->build(++first, last, _val, alloc);
|
65
96
|
}
|
66
97
|
|
67
98
|
template<typename _TyIter>
|
68
99
|
Node* findNode(_TyIter begin, _TyIter end)
|
69
100
|
{
|
70
|
-
if (begin == end) return this;
|
101
|
+
if (begin == end) return (Node*)this;
|
71
102
|
auto n = getNext(*begin);
|
72
103
|
if (n) return n->findNode(++begin, end);
|
73
104
|
return nullptr;
|
@@ -173,21 +204,21 @@ namespace tomoto
|
|
173
204
|
int32_t parent = 0;
|
174
205
|
|
175
206
|
template<typename _TyIter, typename _FnAlloc>
|
176
|
-
|
207
|
+
TrieEx* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
|
177
208
|
{
|
178
209
|
if (first == last)
|
179
210
|
{
|
180
211
|
if (!this->val) this->val = _val;
|
181
|
-
return;
|
212
|
+
return this;
|
182
213
|
}
|
183
214
|
|
184
215
|
auto v = *first;
|
185
|
-
if (!getNext(v))
|
216
|
+
if (!this->getNext(v))
|
186
217
|
{
|
187
218
|
this->next[v] = alloc() - this;
|
188
219
|
this->getNext(v)->parent = -this->next[v];
|
189
220
|
}
|
190
|
-
this->getNext(v)->build(++first, last, _val, alloc);
|
221
|
+
return this->getNext(v)->build(++first, last, _val, alloc);
|
191
222
|
}
|
192
223
|
|
193
224
|
template<typename _FnAlloc>
|
@@ -14,57 +14,55 @@ namespace tomoto
|
|
14
14
|
const Eigen::Matrix<_Ty, -1, 1>& lowerBound,
|
15
15
|
const Eigen::Matrix<_Ty, -1, 1>& upperBound,
|
16
16
|
_Rng& rng,
|
17
|
-
size_t
|
17
|
+
size_t burnIn
|
18
|
+
)
|
18
19
|
{
|
19
|
-
constexpr _Ty epsilon = 1e-6;
|
20
20
|
const size_t K = ret.size();
|
21
|
-
Eigen::Matrix<_Ty, -1, 1>
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
for (size_t i = 0; i <
|
21
|
+
Eigen::Matrix<_Ty, -1, -1> l = multiNormal.getCovL();
|
22
|
+
ret = (lowerBound + upperBound) / 2;
|
23
|
+
Eigen::Matrix<_Ty, -1, 1> z = l.template triangularView<Eigen::Lower>().solve(ret - multiNormal.mean),
|
24
|
+
a = lowerBound - multiNormal.mean,
|
25
|
+
b = upperBound - multiNormal.mean,
|
26
|
+
t, at, bt;
|
27
|
+
for (size_t i = 0; i < burnIn; ++i)
|
28
28
|
{
|
29
|
-
|
30
|
-
if (i) std::shuffle(ks.begin(), ks.end(), rng);
|
31
|
-
for (size_t kx = 0; kx < K; ++kx)
|
29
|
+
for (size_t j = 0; j < K; ++j)
|
32
30
|
{
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
_Ty nLower = lowers[0], nUpper = uppers[0];
|
42
|
-
if (l(k, k) < 0) std::swap(nLower, nUpper);
|
43
|
-
if (i)
|
31
|
+
auto lj = l.col(j);
|
32
|
+
z[j] = 0;
|
33
|
+
t = l * z;
|
34
|
+
_Ty lower_pos = -INFINITY, upper_pos = INFINITY,
|
35
|
+
lower_neg = -INFINITY, upper_neg = INFINITY;
|
36
|
+
at = ((a - t).array() / lj.array()).matrix();
|
37
|
+
bt = ((b - t).array() / lj.array()).matrix();
|
38
|
+
for (size_t k = 0; k < K; ++k)
|
44
39
|
{
|
45
|
-
|
40
|
+
if (lj[k] > 0)
|
41
|
+
{
|
42
|
+
lower_pos = std::max(lower_pos, at[k]);
|
43
|
+
upper_pos = std::min(upper_pos, bt[k]);
|
44
|
+
}
|
45
|
+
else if (lj[k] < 0)
|
46
46
|
{
|
47
|
-
|
48
|
-
|
49
|
-
if (lowers[j] > nLower) nLower = lowers[j];
|
50
|
-
if (uppers[j] < nUpper) nUpper = uppers[j];
|
51
|
-
}
|
52
|
-
else if (l.col(k)(j + k) < -epsilon)
|
53
|
-
{
|
54
|
-
if (uppers[j] > nLower) nLower = uppers[j];
|
55
|
-
if (lowers[j] < nUpper) nUpper = lowers[j];
|
56
|
-
}
|
47
|
+
lower_neg = std::max(lower_neg, bt[k]);
|
48
|
+
upper_neg = std::min(upper_neg, at[k]);
|
57
49
|
}
|
58
50
|
}
|
59
|
-
|
51
|
+
lower_pos = std::max(lower_pos, lower_neg);
|
52
|
+
upper_pos = std::min(upper_pos, upper_neg);
|
53
|
+
// this is due to numerical instability
|
54
|
+
if (lower_pos >= upper_pos)
|
55
|
+
{
|
56
|
+
std::cerr << __FILE__ << "(" << __LINE__ << "): wrong truncation range [" << lower_pos << ", " << upper_pos << "]" << std::endl;
|
57
|
+
z[j] = (lower_pos + upper_pos) / 2;
|
58
|
+
}
|
60
59
|
else
|
61
60
|
{
|
62
|
-
|
61
|
+
z[j] = rtnorm::rtnorm(rng, lower_pos, upper_pos);
|
63
62
|
}
|
64
63
|
}
|
65
64
|
}
|
66
|
-
ret = l *
|
67
|
-
ret += multiNormal.mean;
|
65
|
+
ret = (l * z) + multiNormal.mean;
|
68
66
|
return ret;
|
69
67
|
}
|
70
68
|
|
@@ -70,7 +70,7 @@ namespace tomoto
|
|
70
70
|
}
|
71
71
|
|
72
72
|
template<class UnaryFunction>
|
73
|
-
UnaryFunction
|
73
|
+
UnaryFunction forShuffled(size_t N, size_t seed, UnaryFunction f)
|
74
74
|
{
|
75
75
|
static size_t primes[16] = {
|
76
76
|
65537, 65539, 65543, 65551, 65557, 65563,
|
@@ -206,132 +206,137 @@ namespace tomoto
|
|
206
206
|
}
|
207
207
|
|
208
208
|
template <typename _UnaryFunc, typename _Iterator>
|
209
|
-
class TransformIter
|
209
|
+
class TransformIter : public _Iterator
|
210
210
|
{
|
211
211
|
private:
|
212
|
-
_Iterator i;
|
213
212
|
_UnaryFunc f;
|
214
213
|
public:
|
215
214
|
using reference = typename std::result_of<
|
216
215
|
const _UnaryFunc(typename std::iterator_traits<_Iterator>::reference)
|
217
216
|
>::type;
|
218
217
|
using value_type = reference;
|
219
|
-
|
220
|
-
using pointer = void;
|
221
|
-
using iterator_category = typename std::iterator_traits<_Iterator>::iterator_category;
|
222
|
-
using difference_type = typename std::iterator_traits<_Iterator>::difference_type;
|
223
|
-
|
218
|
+
|
224
219
|
TransformIter(const _Iterator& _iter = {}, _UnaryFunc _f = {})
|
225
|
-
:
|
220
|
+
: _Iterator(_iter), f(_f)
|
226
221
|
{}
|
227
222
|
|
228
223
|
reference operator*()
|
229
224
|
{
|
230
|
-
return f(*
|
225
|
+
return f(_Iterator::operator*());
|
231
226
|
}
|
232
227
|
|
233
228
|
const reference operator*() const
|
234
229
|
{
|
235
|
-
return f(*
|
230
|
+
return f(_Iterator::operator*());
|
236
231
|
}
|
237
232
|
|
238
233
|
reference operator[](std::size_t idx)
|
239
234
|
{
|
240
|
-
return f(
|
235
|
+
return f(_Iterator::operator[](idx));
|
241
236
|
}
|
242
237
|
|
243
238
|
const reference operator[](std::size_t idx) const
|
244
239
|
{
|
245
|
-
return f(
|
240
|
+
return f(_Iterator::operator[](idx));
|
246
241
|
}
|
247
242
|
|
248
243
|
TransformIter& operator++()
|
249
244
|
{
|
250
|
-
++
|
245
|
+
_Iterator::operator++();
|
251
246
|
return *this;
|
252
247
|
}
|
253
248
|
|
254
|
-
TransformIter
|
249
|
+
TransformIter operator++(int)
|
255
250
|
{
|
256
251
|
auto c = *this;
|
257
|
-
++
|
252
|
+
_Iterator::operator++();
|
258
253
|
return c;
|
259
254
|
}
|
260
255
|
|
261
256
|
TransformIter& operator--()
|
262
257
|
{
|
263
|
-
--
|
258
|
+
_Iterator::operator--();
|
264
259
|
return *this;
|
265
260
|
}
|
266
261
|
|
267
|
-
TransformIter
|
262
|
+
TransformIter operator--(int)
|
268
263
|
{
|
269
264
|
auto c = *this;
|
270
|
-
--
|
265
|
+
_Iterator::operator--();
|
271
266
|
return c;
|
272
267
|
}
|
273
268
|
|
274
269
|
TransformIter operator+(int n) const
|
275
270
|
{
|
276
|
-
return {
|
271
|
+
return { _Iterator::operator+(n), f };
|
277
272
|
}
|
278
273
|
|
279
274
|
TransformIter operator-(int n) const
|
280
275
|
{
|
281
|
-
return {
|
276
|
+
return { _Iterator::operator-(n), f };
|
282
277
|
}
|
283
278
|
|
284
279
|
TransformIter& operator+=(int n)
|
285
280
|
{
|
286
|
-
|
281
|
+
_Iterator::operator+=(n);
|
287
282
|
return *this;
|
288
283
|
}
|
289
284
|
|
290
285
|
TransformIter& operator-=(int n)
|
291
286
|
{
|
292
|
-
|
287
|
+
_Iterator::operator-=(n);
|
293
288
|
return *this;
|
294
289
|
}
|
295
290
|
|
296
291
|
typename std::iterator_traits<_Iterator>::difference_type operator-(const TransformIter& o) const
|
297
292
|
{
|
298
|
-
return
|
293
|
+
return (const _Iterator&)*this - (const _Iterator&)o;
|
299
294
|
}
|
300
295
|
|
301
|
-
|
302
|
-
{
|
303
|
-
return i == o.i;
|
304
|
-
}
|
296
|
+
};
|
305
297
|
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
}
|
298
|
+
template <typename _UnaryFunc, typename _Iterator>
|
299
|
+
TransformIter<_UnaryFunc, _Iterator> makeTransformIter(const _Iterator& iter, _UnaryFunc f)
|
300
|
+
{
|
301
|
+
return { iter, f };
|
302
|
+
}
|
310
303
|
|
311
|
-
|
304
|
+
template <typename _Iterator>
|
305
|
+
class StrideIter : public _Iterator
|
306
|
+
{
|
307
|
+
size_t stride;
|
308
|
+
const _Iterator end;
|
309
|
+
public:
|
310
|
+
StrideIter(const _Iterator& iter, size_t _stride = 1, const _Iterator& _end = {})
|
311
|
+
: _Iterator{ iter }, stride{ _stride }, end{ _end }
|
312
312
|
{
|
313
|
-
return i < o.i;
|
314
313
|
}
|
315
314
|
|
316
|
-
|
317
|
-
|
318
|
-
return i > o.i;
|
319
|
-
}
|
315
|
+
StrideIter(const StrideIter&) = default;
|
316
|
+
StrideIter(StrideIter&&) = default;
|
320
317
|
|
321
|
-
|
318
|
+
StrideIter& operator++()
|
322
319
|
{
|
323
|
-
|
320
|
+
for (size_t i = 0; i < stride && *this != end; ++i)
|
321
|
+
{
|
322
|
+
_Iterator::operator++();
|
323
|
+
}
|
324
|
+
return *this;
|
324
325
|
}
|
325
326
|
|
326
|
-
|
327
|
+
StrideIter& operator--()
|
327
328
|
{
|
328
|
-
|
329
|
+
for (size_t i = 0; i < stride && *this != end; ++i)
|
330
|
+
{
|
331
|
+
_Iterator::operator--();
|
332
|
+
}
|
333
|
+
return *this;
|
329
334
|
}
|
330
335
|
};
|
331
336
|
|
332
|
-
template <typename
|
333
|
-
|
337
|
+
template <typename _Iterator>
|
338
|
+
StrideIter<_Iterator> makeStrideIter(const _Iterator& iter, size_t stride, const _Iterator& end = {})
|
334
339
|
{
|
335
|
-
return { iter,
|
340
|
+
return { iter, stride, end };
|
336
341
|
}
|
337
342
|
}
|