tomoto 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -0
  5. data/ext/tomoto/ct.cpp +54 -0
  6. data/ext/tomoto/dmr.cpp +62 -0
  7. data/ext/tomoto/dt.cpp +82 -0
  8. data/ext/tomoto/ext.cpp +27 -773
  9. data/ext/tomoto/gdmr.cpp +34 -0
  10. data/ext/tomoto/hdp.cpp +42 -0
  11. data/ext/tomoto/hlda.cpp +66 -0
  12. data/ext/tomoto/hpa.cpp +27 -0
  13. data/ext/tomoto/lda.cpp +250 -0
  14. data/ext/tomoto/llda.cpp +29 -0
  15. data/ext/tomoto/mglda.cpp +71 -0
  16. data/ext/tomoto/pa.cpp +27 -0
  17. data/ext/tomoto/plda.cpp +29 -0
  18. data/ext/tomoto/slda.cpp +40 -0
  19. data/ext/tomoto/utils.h +84 -0
  20. data/lib/tomoto/tomoto.bundle +0 -0
  21. data/lib/tomoto/tomoto.so +0 -0
  22. data/lib/tomoto/version.rb +1 -1
  23. data/vendor/tomotopy/README.kr.rst +12 -3
  24. data/vendor/tomotopy/README.rst +12 -3
  25. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
  26. data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
  27. data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
  28. data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
  29. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
  30. data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
  31. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
  32. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
  33. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
  34. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
  35. data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
  36. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
  37. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
  39. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
  40. data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
  41. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
  42. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
  43. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
  44. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
  45. data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
  46. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
  47. data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
  48. data/vendor/tomotopy/src/Utils/math.h +8 -4
  49. data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
  50. metadata +24 -60
@@ -121,6 +121,7 @@ namespace tomoto
121
121
  };
122
122
 
123
123
  enum class ParallelScheme { default_, none, copy_merge, partition, size };
124
+ enum class GlobalSampler { train, freeze_topics, inference, size };
124
125
 
125
126
  inline const char* toString(ParallelScheme ps)
126
127
  {
@@ -236,7 +237,7 @@ namespace tomoto
236
237
  virtual const std::vector<uint64_t>& getVocabCf() const = 0;
237
238
  virtual const std::vector<uint64_t>& getVocabDf() const = 0;
238
239
 
239
- virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_) = 0;
240
+ virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
240
241
  virtual size_t getGlobalStep() const = 0;
241
242
  virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) = 0;
242
243
 
@@ -588,7 +589,7 @@ namespace tomoto
588
589
  return ps;
589
590
  }
590
591
 
591
- int train(size_t iteration, size_t numWorkers, ParallelScheme ps) override
592
+ int train(size_t iteration, size_t numWorkers, ParallelScheme ps, bool freeze_topics = false) override
592
593
  {
593
594
  if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
594
595
  ps = getRealScheme(ps);
@@ -606,16 +607,20 @@ namespace tomoto
606
607
  localRG.emplace_back(rg());
607
608
  }
608
609
 
609
- for (size_t i = 0; i < numWorkers; ++i)
610
+ if (ps == ParallelScheme::copy_merge)
610
611
  {
611
- if(ps == ParallelScheme::copy_merge) localData.emplace_back(static_cast<_Derived*>(this)->globalState);
612
+ for (size_t i = 0; i < numWorkers; ++i)
613
+ {
614
+ localData.emplace_back(static_cast<_Derived*>(this)->globalState);
615
+ }
612
616
  }
613
-
614
- if (ps == ParallelScheme::partition)
617
+ else if (ps == ParallelScheme::partition)
615
618
  {
616
619
  localData.resize(numWorkers);
617
- static_cast<_Derived*>(this)->updatePartition(*cachedPool, globalState, localData.data(), docs.begin(), docs.end(),
618
- static_cast<_Derived*>(this)->eddTrain);
620
+ static_cast<_Derived*>(this)->updatePartition(
621
+ *cachedPool, globalState, localData.data(), docs.begin(), docs.end(),
622
+ static_cast<_Derived*>(this)->eddTrain
623
+ );
619
624
  }
620
625
 
621
626
  auto state = ps == ParallelScheme::none ? &globalState : localData.data();
@@ -629,15 +634,15 @@ namespace tomoto
629
634
  {
630
635
  case ParallelScheme::none:
631
636
  static_cast<_Derived*>(this)->template trainOne<ParallelScheme::none>(
632
- *cachedPool, state, localRG.data());
637
+ *cachedPool, state, localRG.data(), freeze_topics);
633
638
  break;
634
639
  case ParallelScheme::copy_merge:
635
640
  static_cast<_Derived*>(this)->template trainOne<ParallelScheme::copy_merge>(
636
- *cachedPool, state, localRG.data());
641
+ *cachedPool, state, localRG.data(), freeze_topics);
637
642
  break;
638
643
  case ParallelScheme::partition:
639
644
  static_cast<_Derived*>(this)->template trainOne<ParallelScheme::partition>(
640
- *cachedPool, state, localRG.data());
645
+ *cachedPool, state, localRG.data(), freeze_topics);
641
646
  break;
642
647
  }
643
648
  break;
@@ -51,8 +51,9 @@ namespace tomoto
51
51
  }
52
52
  if (len > 1) newDist.cov /= len - 1;
53
53
  }
54
- newDist.l = newDist.cov.llt().matrixL();
55
- newDist.logDet = newDist.l.diagonal().array().log().sum();
54
+ Eigen::MatrixXd l = newDist.cov.template cast<double>().llt().matrixL();
55
+ newDist.l = l.template cast<float>();
56
+ newDist.logDet = l.diagonal().array().log().sum();
56
57
  return newDist;
57
58
  }
58
59
 
@@ -26,10 +26,31 @@ namespace tomoto
26
26
  }
27
27
  };
28
28
 
29
+ template<class _Map, class _Node>
30
+ class TrieIterator : public _Map::const_iterator
31
+ {
32
+ using Base = typename _Map::const_iterator;
33
+ using Key = typename _Map::key_type;
34
+ const _Node* base = nullptr;
35
+ public:
36
+
37
+ TrieIterator(const Base& it, const _Node* _base)
38
+ : Base(it), base(_base)
39
+ {
40
+ }
41
+
42
+ std::pair<const Key, const _Node*> operator*() const
43
+ {
44
+ auto p = Base::operator*();
45
+ return std::make_pair(p.first, base + p.second);
46
+ }
47
+ };
48
+
29
49
  template<class _Key, class _Value, class _KeyStore = ConstAccess<std::map<_Key, int32_t>>, class _Trie = void>
30
50
  struct Trie
31
51
  {
32
52
  using Node = typename std::conditional<std::is_same<_Trie, void>::value, Trie, _Trie>::type;
53
+ using iterator = TrieIterator<_KeyStore, Node>;
33
54
  _KeyStore next = {};
34
55
  int32_t fail = 0;
35
56
  _Value val = {};
@@ -47,13 +68,23 @@ namespace tomoto
47
68
  return fail ? (Node*)this + fail : nullptr;
48
69
  }
49
70
 
71
+ iterator begin() const
72
+ {
73
+ return { next.begin(), (const Node*)this };
74
+ }
75
+
76
+ iterator end() const
77
+ {
78
+ return { next.end(), (const Node*)this };
79
+ }
80
+
50
81
  template<typename _TyIter, typename _FnAlloc>
51
- void build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
82
+ Node* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
52
83
  {
53
84
  if (first == last)
54
85
  {
55
86
  if (!val) val = _val;
56
- return;
87
+ return (Node*)this;
57
88
  }
58
89
 
59
90
  auto v = *first;
@@ -61,13 +92,13 @@ namespace tomoto
61
92
  {
62
93
  next[v] = alloc() - this;
63
94
  }
64
- getNext(v)->build(++first, last, _val, alloc);
95
+ return getNext(v)->build(++first, last, _val, alloc);
65
96
  }
66
97
 
67
98
  template<typename _TyIter>
68
99
  Node* findNode(_TyIter begin, _TyIter end)
69
100
  {
70
- if (begin == end) return this;
101
+ if (begin == end) return (Node*)this;
71
102
  auto n = getNext(*begin);
72
103
  if (n) return n->findNode(++begin, end);
73
104
  return nullptr;
@@ -173,21 +204,21 @@ namespace tomoto
173
204
  int32_t parent = 0;
174
205
 
175
206
  template<typename _TyIter, typename _FnAlloc>
176
- void build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
207
+ TrieEx* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
177
208
  {
178
209
  if (first == last)
179
210
  {
180
211
  if (!this->val) this->val = _val;
181
- return;
212
+ return this;
182
213
  }
183
214
 
184
215
  auto v = *first;
185
- if (!getNext(v))
216
+ if (!this->getNext(v))
186
217
  {
187
218
  this->next[v] = alloc() - this;
188
219
  this->getNext(v)->parent = -this->next[v];
189
220
  }
190
- this->getNext(v)->build(++first, last, _val, alloc);
221
+ return this->getNext(v)->build(++first, last, _val, alloc);
191
222
  }
192
223
 
193
224
  template<typename _FnAlloc>
@@ -14,57 +14,55 @@ namespace tomoto
14
14
  const Eigen::Matrix<_Ty, -1, 1>& lowerBound,
15
15
  const Eigen::Matrix<_Ty, -1, 1>& upperBound,
16
16
  _Rng& rng,
17
- size_t iteration)
17
+ size_t burnIn
18
+ )
18
19
  {
19
- constexpr _Ty epsilon = 1e-6;
20
20
  const size_t K = ret.size();
21
- Eigen::Matrix<_Ty, -1, 1> bias = Eigen::Matrix<_Ty, -1, 1>::Zero(K), lowers, uppers;
22
- auto& l = multiNormal.getCovL();
23
- ret.setZero();
24
-
25
- std::vector<size_t> ks(K);
26
- std::iota(ks.begin(), ks.end(), 0);
27
- for (size_t i = 0; i < iteration; ++i)
21
+ Eigen::Matrix<_Ty, -1, -1> l = multiNormal.getCovL();
22
+ ret = (lowerBound + upperBound) / 2;
23
+ Eigen::Matrix<_Ty, -1, 1> z = l.template triangularView<Eigen::Lower>().solve(ret - multiNormal.mean),
24
+ a = lowerBound - multiNormal.mean,
25
+ b = upperBound - multiNormal.mean,
26
+ t, at, bt;
27
+ for (size_t i = 0; i < burnIn; ++i)
28
28
  {
29
- // shuffle sampling orders except during initialization
30
- if (i) std::shuffle(ks.begin(), ks.end(), rng);
31
- for (size_t kx = 0; kx < K; ++kx)
29
+ for (size_t j = 0; j < K; ++j)
32
30
  {
33
- size_t k = ks[kx];
34
- ret[k] = 0;
35
- //bias = multiNormal.mean + l * ret;
36
- //bias.tail(K - k) = multiNormal.mean.tail(K - k) + l.block(k, 0, K - k, K) * ret;
37
- bias.tail(K - k) = multiNormal.mean.tail(K - k);
38
- bias.tail(K - k).noalias() += l.block(k, 0, K - k, K) * ret;
39
- lowers = (lowerBound - bias).tail(K - k).array() / l.col(k).tail(K - k).array();
40
- uppers = (upperBound - bias).tail(K - k).array() / l.col(k).tail(K - k).array();
41
- _Ty nLower = lowers[0], nUpper = uppers[0];
42
- if (l(k, k) < 0) std::swap(nLower, nUpper);
43
- if (i)
31
+ auto lj = l.col(j);
32
+ z[j] = 0;
33
+ t = l * z;
34
+ _Ty lower_pos = -INFINITY, upper_pos = INFINITY,
35
+ lower_neg = -INFINITY, upper_neg = INFINITY;
36
+ at = ((a - t).array() / lj.array()).matrix();
37
+ bt = ((b - t).array() / lj.array()).matrix();
38
+ for (size_t k = 0; k < K; ++k)
44
39
  {
45
- for (size_t j = 1; j < lowers.size(); ++j)
40
+ if (lj[k] > 0)
41
+ {
42
+ lower_pos = std::max(lower_pos, at[k]);
43
+ upper_pos = std::min(upper_pos, bt[k]);
44
+ }
45
+ else if (lj[k] < 0)
46
46
  {
47
- if (l.col(k)(j + k) > epsilon)
48
- {
49
- if (lowers[j] > nLower) nLower = lowers[j];
50
- if (uppers[j] < nUpper) nUpper = uppers[j];
51
- }
52
- else if (l.col(k)(j + k) < -epsilon)
53
- {
54
- if (uppers[j] > nLower) nLower = uppers[j];
55
- if (lowers[j] < nUpper) nUpper = lowers[j];
56
- }
47
+ lower_neg = std::max(lower_neg, bt[k]);
48
+ upper_neg = std::min(upper_neg, at[k]);
57
49
  }
58
50
  }
59
- if (abs(nLower - nUpper) <= 1e-4) ret[k] = (nLower + nUpper) / 2;
51
+ lower_pos = std::max(lower_pos, lower_neg);
52
+ upper_pos = std::min(upper_pos, upper_neg);
53
+ // this is due to numerical instability
54
+ if (lower_pos >= upper_pos)
55
+ {
56
+ std::cerr << __FILE__ << "(" << __LINE__ << "): wrong truncation range [" << lower_pos << ", " << upper_pos << "]" << std::endl;
57
+ z[j] = (lower_pos + upper_pos) / 2;
58
+ }
60
59
  else
61
60
  {
62
- ret[k] = rtnorm::rtnorm(rng, nLower, nUpper);
61
+ z[j] = rtnorm::rtnorm(rng, lower_pos, upper_pos);
63
62
  }
64
63
  }
65
64
  }
66
- ret = l * ret;
67
- ret += multiNormal.mean;
65
+ ret = (l * z) + multiNormal.mean;
68
66
  return ret;
69
67
  }
70
68
 
@@ -70,7 +70,7 @@ namespace tomoto
70
70
  }
71
71
 
72
72
  template<class UnaryFunction>
73
- UnaryFunction forRandom(size_t N, size_t seed, UnaryFunction f)
73
+ UnaryFunction forShuffled(size_t N, size_t seed, UnaryFunction f)
74
74
  {
75
75
  static size_t primes[16] = {
76
76
  65537, 65539, 65543, 65551, 65557, 65563,
@@ -206,132 +206,137 @@ namespace tomoto
206
206
  }
207
207
 
208
208
  template <typename _UnaryFunc, typename _Iterator>
209
- class TransformIter
209
+ class TransformIter : public _Iterator
210
210
  {
211
211
  private:
212
- _Iterator i;
213
212
  _UnaryFunc f;
214
213
  public:
215
214
  using reference = typename std::result_of<
216
215
  const _UnaryFunc(typename std::iterator_traits<_Iterator>::reference)
217
216
  >::type;
218
217
  using value_type = reference;
219
-
220
- using pointer = void;
221
- using iterator_category = typename std::iterator_traits<_Iterator>::iterator_category;
222
- using difference_type = typename std::iterator_traits<_Iterator>::difference_type;
223
-
218
+
224
219
  TransformIter(const _Iterator& _iter = {}, _UnaryFunc _f = {})
225
- : i(_iter), f(_f)
220
+ : _Iterator(_iter), f(_f)
226
221
  {}
227
222
 
228
223
  reference operator*()
229
224
  {
230
- return f(*i);
225
+ return f(_Iterator::operator*());
231
226
  }
232
227
 
233
228
  const reference operator*() const
234
229
  {
235
- return f(*i);
230
+ return f(_Iterator::operator*());
236
231
  }
237
232
 
238
233
  reference operator[](std::size_t idx)
239
234
  {
240
- return f(i[idx]);
235
+ return f(_Iterator::operator[](idx));
241
236
  }
242
237
 
243
238
  const reference operator[](std::size_t idx) const
244
239
  {
245
- return f(i[idx]);
240
+ return f(_Iterator::operator[](idx));
246
241
  }
247
242
 
248
243
  TransformIter& operator++()
249
244
  {
250
- ++i;
245
+ _Iterator::operator++();
251
246
  return *this;
252
247
  }
253
248
 
254
- TransformIter& operator++(int)
249
+ TransformIter operator++(int)
255
250
  {
256
251
  auto c = *this;
257
- ++i;
252
+ _Iterator::operator++();
258
253
  return c;
259
254
  }
260
255
 
261
256
  TransformIter& operator--()
262
257
  {
263
- --i;
258
+ _Iterator::operator--();
264
259
  return *this;
265
260
  }
266
261
 
267
- TransformIter& operator--(int)
262
+ TransformIter operator--(int)
268
263
  {
269
264
  auto c = *this;
270
- --i;
265
+ _Iterator::operator--();
271
266
  return c;
272
267
  }
273
268
 
274
269
  TransformIter operator+(int n) const
275
270
  {
276
- return { f, i + n };
271
+ return { _Iterator::operator+(n), f };
277
272
  }
278
273
 
279
274
  TransformIter operator-(int n) const
280
275
  {
281
- return { f, i - n };
276
+ return { _Iterator::operator-(n), f };
282
277
  }
283
278
 
284
279
  TransformIter& operator+=(int n)
285
280
  {
286
- i += n;
281
+ _Iterator::operator+=(n);
287
282
  return *this;
288
283
  }
289
284
 
290
285
  TransformIter& operator-=(int n)
291
286
  {
292
- i -= n;
287
+ _Iterator::operator-=(n);
293
288
  return *this;
294
289
  }
295
290
 
296
291
  typename std::iterator_traits<_Iterator>::difference_type operator-(const TransformIter& o) const
297
292
  {
298
- return i - o.i;
293
+ return (const _Iterator&)*this - (const _Iterator&)o;
299
294
  }
300
295
 
301
- bool operator==(const TransformIter& o) const
302
- {
303
- return i == o.i;
304
- }
296
+ };
305
297
 
306
- bool operator!=(const TransformIter& o) const
307
- {
308
- return i != o.i;
309
- }
298
+ template <typename _UnaryFunc, typename _Iterator>
299
+ TransformIter<_UnaryFunc, _Iterator> makeTransformIter(const _Iterator& iter, _UnaryFunc f)
300
+ {
301
+ return { iter, f };
302
+ }
310
303
 
311
- bool operator<(const TransformIter& o) const
304
+ template <typename _Iterator>
305
+ class StrideIter : public _Iterator
306
+ {
307
+ size_t stride;
308
+ const _Iterator end;
309
+ public:
310
+ StrideIter(const _Iterator& iter, size_t _stride = 1, const _Iterator& _end = {})
311
+ : _Iterator{ iter }, stride{ _stride }, end{ _end }
312
312
  {
313
- return i < o.i;
314
313
  }
315
314
 
316
- bool operator>(const TransformIter& o) const
317
- {
318
- return i > o.i;
319
- }
315
+ StrideIter(const StrideIter&) = default;
316
+ StrideIter(StrideIter&&) = default;
320
317
 
321
- bool operator<=(const TransformIter& o) const
318
+ StrideIter& operator++()
322
319
  {
323
- return i <= o.i;
320
+ for (size_t i = 0; i < stride && *this != end; ++i)
321
+ {
322
+ _Iterator::operator++();
323
+ }
324
+ return *this;
324
325
  }
325
326
 
326
- bool operator>=(const TransformIter& o) const
327
+ StrideIter& operator--()
327
328
  {
328
- return i >= o.i;
329
+ for (size_t i = 0; i < stride && *this != end; ++i)
330
+ {
331
+ _Iterator::operator--();
332
+ }
333
+ return *this;
329
334
  }
330
335
  };
331
336
 
332
- template <typename _UnaryFunc, typename _Iterator>
333
- TransformIter<_UnaryFunc, _Iterator> makeTransformIter(const _Iterator& iter, _UnaryFunc f)
337
+ template <typename _Iterator>
338
+ StrideIter<_Iterator> makeStrideIter(const _Iterator& iter, size_t stride, const _Iterator& end = {})
334
339
  {
335
- return { iter, f };
340
+ return { iter, stride, end };
336
341
  }
337
342
  }