tomoto 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -0
  5. data/ext/tomoto/ct.cpp +54 -0
  6. data/ext/tomoto/dmr.cpp +62 -0
  7. data/ext/tomoto/dt.cpp +82 -0
  8. data/ext/tomoto/ext.cpp +27 -773
  9. data/ext/tomoto/gdmr.cpp +34 -0
  10. data/ext/tomoto/hdp.cpp +42 -0
  11. data/ext/tomoto/hlda.cpp +66 -0
  12. data/ext/tomoto/hpa.cpp +27 -0
  13. data/ext/tomoto/lda.cpp +250 -0
  14. data/ext/tomoto/llda.cpp +29 -0
  15. data/ext/tomoto/mglda.cpp +71 -0
  16. data/ext/tomoto/pa.cpp +27 -0
  17. data/ext/tomoto/plda.cpp +29 -0
  18. data/ext/tomoto/slda.cpp +40 -0
  19. data/ext/tomoto/utils.h +84 -0
  20. data/lib/tomoto/tomoto.bundle +0 -0
  21. data/lib/tomoto/tomoto.so +0 -0
  22. data/lib/tomoto/version.rb +1 -1
  23. data/vendor/tomotopy/README.kr.rst +12 -3
  24. data/vendor/tomotopy/README.rst +12 -3
  25. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
  26. data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
  27. data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
  28. data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
  29. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
  30. data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
  31. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
  32. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
  33. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
  34. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
  35. data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
  36. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
  37. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
  39. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
  40. data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
  41. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
  42. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
  43. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
  44. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
  45. data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
  46. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
  47. data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
  48. data/vendor/tomotopy/src/Utils/math.h +8 -4
  49. data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
  50. metadata +24 -60
@@ -121,6 +121,7 @@ namespace tomoto
121
121
  };
122
122
 
123
123
  enum class ParallelScheme { default_, none, copy_merge, partition, size };
124
+ enum class GlobalSampler { train, freeze_topics, inference, size };
124
125
 
125
126
  inline const char* toString(ParallelScheme ps)
126
127
  {
@@ -236,7 +237,7 @@ namespace tomoto
236
237
  virtual const std::vector<uint64_t>& getVocabCf() const = 0;
237
238
  virtual const std::vector<uint64_t>& getVocabDf() const = 0;
238
239
 
239
- virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_) = 0;
240
+ virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
240
241
  virtual size_t getGlobalStep() const = 0;
241
242
  virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) = 0;
242
243
 
@@ -588,7 +589,7 @@ namespace tomoto
588
589
  return ps;
589
590
  }
590
591
 
591
- int train(size_t iteration, size_t numWorkers, ParallelScheme ps) override
592
+ int train(size_t iteration, size_t numWorkers, ParallelScheme ps, bool freeze_topics = false) override
592
593
  {
593
594
  if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
594
595
  ps = getRealScheme(ps);
@@ -606,16 +607,20 @@ namespace tomoto
606
607
  localRG.emplace_back(rg());
607
608
  }
608
609
 
609
- for (size_t i = 0; i < numWorkers; ++i)
610
+ if (ps == ParallelScheme::copy_merge)
610
611
  {
611
- if(ps == ParallelScheme::copy_merge) localData.emplace_back(static_cast<_Derived*>(this)->globalState);
612
+ for (size_t i = 0; i < numWorkers; ++i)
613
+ {
614
+ localData.emplace_back(static_cast<_Derived*>(this)->globalState);
615
+ }
612
616
  }
613
-
614
- if (ps == ParallelScheme::partition)
617
+ else if (ps == ParallelScheme::partition)
615
618
  {
616
619
  localData.resize(numWorkers);
617
- static_cast<_Derived*>(this)->updatePartition(*cachedPool, globalState, localData.data(), docs.begin(), docs.end(),
618
- static_cast<_Derived*>(this)->eddTrain);
620
+ static_cast<_Derived*>(this)->updatePartition(
621
+ *cachedPool, globalState, localData.data(), docs.begin(), docs.end(),
622
+ static_cast<_Derived*>(this)->eddTrain
623
+ );
619
624
  }
620
625
 
621
626
  auto state = ps == ParallelScheme::none ? &globalState : localData.data();
@@ -629,15 +634,15 @@ namespace tomoto
629
634
  {
630
635
  case ParallelScheme::none:
631
636
  static_cast<_Derived*>(this)->template trainOne<ParallelScheme::none>(
632
- *cachedPool, state, localRG.data());
637
+ *cachedPool, state, localRG.data(), freeze_topics);
633
638
  break;
634
639
  case ParallelScheme::copy_merge:
635
640
  static_cast<_Derived*>(this)->template trainOne<ParallelScheme::copy_merge>(
636
- *cachedPool, state, localRG.data());
641
+ *cachedPool, state, localRG.data(), freeze_topics);
637
642
  break;
638
643
  case ParallelScheme::partition:
639
644
  static_cast<_Derived*>(this)->template trainOne<ParallelScheme::partition>(
640
- *cachedPool, state, localRG.data());
645
+ *cachedPool, state, localRG.data(), freeze_topics);
641
646
  break;
642
647
  }
643
648
  break;
@@ -51,8 +51,9 @@ namespace tomoto
51
51
  }
52
52
  if (len > 1) newDist.cov /= len - 1;
53
53
  }
54
- newDist.l = newDist.cov.llt().matrixL();
55
- newDist.logDet = newDist.l.diagonal().array().log().sum();
54
+ Eigen::MatrixXd l = newDist.cov.template cast<double>().llt().matrixL();
55
+ newDist.l = l.template cast<float>();
56
+ newDist.logDet = l.diagonal().array().log().sum();
56
57
  return newDist;
57
58
  }
58
59
 
@@ -26,10 +26,31 @@ namespace tomoto
26
26
  }
27
27
  };
28
28
 
29
+ template<class _Map, class _Node>
30
+ class TrieIterator : public _Map::const_iterator
31
+ {
32
+ using Base = typename _Map::const_iterator;
33
+ using Key = typename _Map::key_type;
34
+ const _Node* base = nullptr;
35
+ public:
36
+
37
+ TrieIterator(const Base& it, const _Node* _base)
38
+ : Base(it), base(_base)
39
+ {
40
+ }
41
+
42
+ std::pair<const Key, const _Node*> operator*() const
43
+ {
44
+ auto p = Base::operator*();
45
+ return std::make_pair(p.first, base + p.second);
46
+ }
47
+ };
48
+
29
49
  template<class _Key, class _Value, class _KeyStore = ConstAccess<std::map<_Key, int32_t>>, class _Trie = void>
30
50
  struct Trie
31
51
  {
32
52
  using Node = typename std::conditional<std::is_same<_Trie, void>::value, Trie, _Trie>::type;
53
+ using iterator = TrieIterator<_KeyStore, Node>;
33
54
  _KeyStore next = {};
34
55
  int32_t fail = 0;
35
56
  _Value val = {};
@@ -47,13 +68,23 @@ namespace tomoto
47
68
  return fail ? (Node*)this + fail : nullptr;
48
69
  }
49
70
 
71
+ iterator begin() const
72
+ {
73
+ return { next.begin(), (const Node*)this };
74
+ }
75
+
76
+ iterator end() const
77
+ {
78
+ return { next.end(), (const Node*)this };
79
+ }
80
+
50
81
  template<typename _TyIter, typename _FnAlloc>
51
- void build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
82
+ Node* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
52
83
  {
53
84
  if (first == last)
54
85
  {
55
86
  if (!val) val = _val;
56
- return;
87
+ return (Node*)this;
57
88
  }
58
89
 
59
90
  auto v = *first;
@@ -61,13 +92,13 @@ namespace tomoto
61
92
  {
62
93
  next[v] = alloc() - this;
63
94
  }
64
- getNext(v)->build(++first, last, _val, alloc);
95
+ return getNext(v)->build(++first, last, _val, alloc);
65
96
  }
66
97
 
67
98
  template<typename _TyIter>
68
99
  Node* findNode(_TyIter begin, _TyIter end)
69
100
  {
70
- if (begin == end) return this;
101
+ if (begin == end) return (Node*)this;
71
102
  auto n = getNext(*begin);
72
103
  if (n) return n->findNode(++begin, end);
73
104
  return nullptr;
@@ -173,21 +204,21 @@ namespace tomoto
173
204
  int32_t parent = 0;
174
205
 
175
206
  template<typename _TyIter, typename _FnAlloc>
176
- void build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
207
+ TrieEx* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc)
177
208
  {
178
209
  if (first == last)
179
210
  {
180
211
  if (!this->val) this->val = _val;
181
- return;
212
+ return this;
182
213
  }
183
214
 
184
215
  auto v = *first;
185
- if (!getNext(v))
216
+ if (!this->getNext(v))
186
217
  {
187
218
  this->next[v] = alloc() - this;
188
219
  this->getNext(v)->parent = -this->next[v];
189
220
  }
190
- this->getNext(v)->build(++first, last, _val, alloc);
221
+ return this->getNext(v)->build(++first, last, _val, alloc);
191
222
  }
192
223
 
193
224
  template<typename _FnAlloc>
@@ -14,57 +14,55 @@ namespace tomoto
14
14
  const Eigen::Matrix<_Ty, -1, 1>& lowerBound,
15
15
  const Eigen::Matrix<_Ty, -1, 1>& upperBound,
16
16
  _Rng& rng,
17
- size_t iteration)
17
+ size_t burnIn
18
+ )
18
19
  {
19
- constexpr _Ty epsilon = 1e-6;
20
20
  const size_t K = ret.size();
21
- Eigen::Matrix<_Ty, -1, 1> bias = Eigen::Matrix<_Ty, -1, 1>::Zero(K), lowers, uppers;
22
- auto& l = multiNormal.getCovL();
23
- ret.setZero();
24
-
25
- std::vector<size_t> ks(K);
26
- std::iota(ks.begin(), ks.end(), 0);
27
- for (size_t i = 0; i < iteration; ++i)
21
+ Eigen::Matrix<_Ty, -1, -1> l = multiNormal.getCovL();
22
+ ret = (lowerBound + upperBound) / 2;
23
+ Eigen::Matrix<_Ty, -1, 1> z = l.template triangularView<Eigen::Lower>().solve(ret - multiNormal.mean),
24
+ a = lowerBound - multiNormal.mean,
25
+ b = upperBound - multiNormal.mean,
26
+ t, at, bt;
27
+ for (size_t i = 0; i < burnIn; ++i)
28
28
  {
29
- // shuffle sampling orders except during initialization
30
- if (i) std::shuffle(ks.begin(), ks.end(), rng);
31
- for (size_t kx = 0; kx < K; ++kx)
29
+ for (size_t j = 0; j < K; ++j)
32
30
  {
33
- size_t k = ks[kx];
34
- ret[k] = 0;
35
- //bias = multiNormal.mean + l * ret;
36
- //bias.tail(K - k) = multiNormal.mean.tail(K - k) + l.block(k, 0, K - k, K) * ret;
37
- bias.tail(K - k) = multiNormal.mean.tail(K - k);
38
- bias.tail(K - k).noalias() += l.block(k, 0, K - k, K) * ret;
39
- lowers = (lowerBound - bias).tail(K - k).array() / l.col(k).tail(K - k).array();
40
- uppers = (upperBound - bias).tail(K - k).array() / l.col(k).tail(K - k).array();
41
- _Ty nLower = lowers[0], nUpper = uppers[0];
42
- if (l(k, k) < 0) std::swap(nLower, nUpper);
43
- if (i)
31
+ auto lj = l.col(j);
32
+ z[j] = 0;
33
+ t = l * z;
34
+ _Ty lower_pos = -INFINITY, upper_pos = INFINITY,
35
+ lower_neg = -INFINITY, upper_neg = INFINITY;
36
+ at = ((a - t).array() / lj.array()).matrix();
37
+ bt = ((b - t).array() / lj.array()).matrix();
38
+ for (size_t k = 0; k < K; ++k)
44
39
  {
45
- for (size_t j = 1; j < lowers.size(); ++j)
40
+ if (lj[k] > 0)
41
+ {
42
+ lower_pos = std::max(lower_pos, at[k]);
43
+ upper_pos = std::min(upper_pos, bt[k]);
44
+ }
45
+ else if (lj[k] < 0)
46
46
  {
47
- if (l.col(k)(j + k) > epsilon)
48
- {
49
- if (lowers[j] > nLower) nLower = lowers[j];
50
- if (uppers[j] < nUpper) nUpper = uppers[j];
51
- }
52
- else if (l.col(k)(j + k) < -epsilon)
53
- {
54
- if (uppers[j] > nLower) nLower = uppers[j];
55
- if (lowers[j] < nUpper) nUpper = lowers[j];
56
- }
47
+ lower_neg = std::max(lower_neg, bt[k]);
48
+ upper_neg = std::min(upper_neg, at[k]);
57
49
  }
58
50
  }
59
- if (abs(nLower - nUpper) <= 1e-4) ret[k] = (nLower + nUpper) / 2;
51
+ lower_pos = std::max(lower_pos, lower_neg);
52
+ upper_pos = std::min(upper_pos, upper_neg);
53
+ // this is due to numerical instability
54
+ if (lower_pos >= upper_pos)
55
+ {
56
+ std::cerr << __FILE__ << "(" << __LINE__ << "): wrong truncation range [" << lower_pos << ", " << upper_pos << "]" << std::endl;
57
+ z[j] = (lower_pos + upper_pos) / 2;
58
+ }
60
59
  else
61
60
  {
62
- ret[k] = rtnorm::rtnorm(rng, nLower, nUpper);
61
+ z[j] = rtnorm::rtnorm(rng, lower_pos, upper_pos);
63
62
  }
64
63
  }
65
64
  }
66
- ret = l * ret;
67
- ret += multiNormal.mean;
65
+ ret = (l * z) + multiNormal.mean;
68
66
  return ret;
69
67
  }
70
68
 
@@ -70,7 +70,7 @@ namespace tomoto
70
70
  }
71
71
 
72
72
  template<class UnaryFunction>
73
- UnaryFunction forRandom(size_t N, size_t seed, UnaryFunction f)
73
+ UnaryFunction forShuffled(size_t N, size_t seed, UnaryFunction f)
74
74
  {
75
75
  static size_t primes[16] = {
76
76
  65537, 65539, 65543, 65551, 65557, 65563,
@@ -206,132 +206,137 @@ namespace tomoto
206
206
  }
207
207
 
208
208
  template <typename _UnaryFunc, typename _Iterator>
209
- class TransformIter
209
+ class TransformIter : public _Iterator
210
210
  {
211
211
  private:
212
- _Iterator i;
213
212
  _UnaryFunc f;
214
213
  public:
215
214
  using reference = typename std::result_of<
216
215
  const _UnaryFunc(typename std::iterator_traits<_Iterator>::reference)
217
216
  >::type;
218
217
  using value_type = reference;
219
-
220
- using pointer = void;
221
- using iterator_category = typename std::iterator_traits<_Iterator>::iterator_category;
222
- using difference_type = typename std::iterator_traits<_Iterator>::difference_type;
223
-
218
+
224
219
  TransformIter(const _Iterator& _iter = {}, _UnaryFunc _f = {})
225
- : i(_iter), f(_f)
220
+ : _Iterator(_iter), f(_f)
226
221
  {}
227
222
 
228
223
  reference operator*()
229
224
  {
230
- return f(*i);
225
+ return f(_Iterator::operator*());
231
226
  }
232
227
 
233
228
  const reference operator*() const
234
229
  {
235
- return f(*i);
230
+ return f(_Iterator::operator*());
236
231
  }
237
232
 
238
233
  reference operator[](std::size_t idx)
239
234
  {
240
- return f(i[idx]);
235
+ return f(_Iterator::operator[](idx));
241
236
  }
242
237
 
243
238
  const reference operator[](std::size_t idx) const
244
239
  {
245
- return f(i[idx]);
240
+ return f(_Iterator::operator[](idx));
246
241
  }
247
242
 
248
243
  TransformIter& operator++()
249
244
  {
250
- ++i;
245
+ _Iterator::operator++();
251
246
  return *this;
252
247
  }
253
248
 
254
- TransformIter& operator++(int)
249
+ TransformIter operator++(int)
255
250
  {
256
251
  auto c = *this;
257
- ++i;
252
+ _Iterator::operator++();
258
253
  return c;
259
254
  }
260
255
 
261
256
  TransformIter& operator--()
262
257
  {
263
- --i;
258
+ _Iterator::operator--();
264
259
  return *this;
265
260
  }
266
261
 
267
- TransformIter& operator--(int)
262
+ TransformIter operator--(int)
268
263
  {
269
264
  auto c = *this;
270
- --i;
265
+ _Iterator::operator--();
271
266
  return c;
272
267
  }
273
268
 
274
269
  TransformIter operator+(int n) const
275
270
  {
276
- return { f, i + n };
271
+ return { _Iterator::operator+(n), f };
277
272
  }
278
273
 
279
274
  TransformIter operator-(int n) const
280
275
  {
281
- return { f, i - n };
276
+ return { _Iterator::operator-(n), f };
282
277
  }
283
278
 
284
279
  TransformIter& operator+=(int n)
285
280
  {
286
- i += n;
281
+ _Iterator::operator+=(n);
287
282
  return *this;
288
283
  }
289
284
 
290
285
  TransformIter& operator-=(int n)
291
286
  {
292
- i -= n;
287
+ _Iterator::operator-=(n);
293
288
  return *this;
294
289
  }
295
290
 
296
291
  typename std::iterator_traits<_Iterator>::difference_type operator-(const TransformIter& o) const
297
292
  {
298
- return i - o.i;
293
+ return (const _Iterator&)*this - (const _Iterator&)o;
299
294
  }
300
295
 
301
- bool operator==(const TransformIter& o) const
302
- {
303
- return i == o.i;
304
- }
296
+ };
305
297
 
306
- bool operator!=(const TransformIter& o) const
307
- {
308
- return i != o.i;
309
- }
298
+ template <typename _UnaryFunc, typename _Iterator>
299
+ TransformIter<_UnaryFunc, _Iterator> makeTransformIter(const _Iterator& iter, _UnaryFunc f)
300
+ {
301
+ return { iter, f };
302
+ }
310
303
 
311
- bool operator<(const TransformIter& o) const
304
+ template <typename _Iterator>
305
+ class StrideIter : public _Iterator
306
+ {
307
+ size_t stride;
308
+ const _Iterator end;
309
+ public:
310
+ StrideIter(const _Iterator& iter, size_t _stride = 1, const _Iterator& _end = {})
311
+ : _Iterator{ iter }, stride{ _stride }, end{ _end }
312
312
  {
313
- return i < o.i;
314
313
  }
315
314
 
316
- bool operator>(const TransformIter& o) const
317
- {
318
- return i > o.i;
319
- }
315
+ StrideIter(const StrideIter&) = default;
316
+ StrideIter(StrideIter&&) = default;
320
317
 
321
- bool operator<=(const TransformIter& o) const
318
+ StrideIter& operator++()
322
319
  {
323
- return i <= o.i;
320
+ for (size_t i = 0; i < stride && *this != end; ++i)
321
+ {
322
+ _Iterator::operator++();
323
+ }
324
+ return *this;
324
325
  }
325
326
 
326
- bool operator>=(const TransformIter& o) const
327
+ StrideIter& operator--()
327
328
  {
328
- return i >= o.i;
329
+ for (size_t i = 0; i < stride && *this != end; ++i)
330
+ {
331
+ _Iterator::operator--();
332
+ }
333
+ return *this;
329
334
  }
330
335
  };
331
336
 
332
- template <typename _UnaryFunc, typename _Iterator>
333
- TransformIter<_UnaryFunc, _Iterator> makeTransformIter(const _Iterator& iter, _UnaryFunc f)
337
+ template <typename _Iterator>
338
+ StrideIter<_Iterator> makeStrideIter(const _Iterator& iter, size_t stride, const _Iterator& end = {})
334
339
  {
335
- return { iter, f };
340
+ return { iter, stride, end };
336
341
  }
337
342
  }