tomoto 0.3.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (97) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +45 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +162 -0
  5. data/ext/tomoto/ct.cpp +58 -0
  6. data/ext/tomoto/dmr.cpp +69 -0
  7. data/ext/tomoto/dt.cpp +91 -0
  8. data/ext/tomoto/extconf.rb +34 -0
  9. data/ext/tomoto/gdmr.cpp +42 -0
  10. data/ext/tomoto/hdp.cpp +47 -0
  11. data/ext/tomoto/hlda.cpp +71 -0
  12. data/ext/tomoto/hpa.cpp +32 -0
  13. data/ext/tomoto/lda.cpp +281 -0
  14. data/ext/tomoto/llda.cpp +33 -0
  15. data/ext/tomoto/mglda.cpp +81 -0
  16. data/ext/tomoto/pa.cpp +32 -0
  17. data/ext/tomoto/plda.cpp +33 -0
  18. data/ext/tomoto/slda.cpp +48 -0
  19. data/ext/tomoto/tomoto.cpp +48 -0
  20. data/ext/tomoto/utils.h +30 -0
  21. data/lib/tomoto/2.7/tomoto.so +0 -0
  22. data/lib/tomoto/3.0/tomoto.so +0 -0
  23. data/lib/tomoto/3.1/tomoto.so +0 -0
  24. data/lib/tomoto/ct.rb +24 -0
  25. data/lib/tomoto/dmr.rb +27 -0
  26. data/lib/tomoto/dt.rb +15 -0
  27. data/lib/tomoto/gdmr.rb +15 -0
  28. data/lib/tomoto/hdp.rb +11 -0
  29. data/lib/tomoto/hlda.rb +56 -0
  30. data/lib/tomoto/hpa.rb +11 -0
  31. data/lib/tomoto/lda.rb +181 -0
  32. data/lib/tomoto/llda.rb +15 -0
  33. data/lib/tomoto/mglda.rb +15 -0
  34. data/lib/tomoto/pa.rb +11 -0
  35. data/lib/tomoto/plda.rb +15 -0
  36. data/lib/tomoto/slda.rb +37 -0
  37. data/lib/tomoto/version.rb +3 -0
  38. data/lib/tomoto.rb +27 -0
  39. data/vendor/EigenRand/EigenRand/EigenRand +24 -0
  40. data/vendor/EigenRand/LICENSE +21 -0
  41. data/vendor/EigenRand/README.md +426 -0
  42. data/vendor/eigen/COPYING.APACHE +203 -0
  43. data/vendor/eigen/COPYING.BSD +26 -0
  44. data/vendor/eigen/COPYING.GPL +674 -0
  45. data/vendor/eigen/COPYING.LGPL +502 -0
  46. data/vendor/eigen/COPYING.MINPACK +51 -0
  47. data/vendor/eigen/COPYING.MPL2 +373 -0
  48. data/vendor/eigen/COPYING.README +18 -0
  49. data/vendor/eigen/Eigen/Cholesky +45 -0
  50. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  51. data/vendor/eigen/Eigen/Core +384 -0
  52. data/vendor/eigen/Eigen/Dense +7 -0
  53. data/vendor/eigen/Eigen/Eigen +2 -0
  54. data/vendor/eigen/Eigen/Eigenvalues +60 -0
  55. data/vendor/eigen/Eigen/Geometry +59 -0
  56. data/vendor/eigen/Eigen/Householder +29 -0
  57. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  58. data/vendor/eigen/Eigen/Jacobi +32 -0
  59. data/vendor/eigen/Eigen/KLUSupport +41 -0
  60. data/vendor/eigen/Eigen/LU +47 -0
  61. data/vendor/eigen/Eigen/MetisSupport +35 -0
  62. data/vendor/eigen/Eigen/OrderingMethods +70 -0
  63. data/vendor/eigen/Eigen/PaStiXSupport +49 -0
  64. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  65. data/vendor/eigen/Eigen/QR +50 -0
  66. data/vendor/eigen/Eigen/QtAlignedMalloc +39 -0
  67. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  68. data/vendor/eigen/Eigen/SVD +50 -0
  69. data/vendor/eigen/Eigen/Sparse +34 -0
  70. data/vendor/eigen/Eigen/SparseCholesky +37 -0
  71. data/vendor/eigen/Eigen/SparseCore +69 -0
  72. data/vendor/eigen/Eigen/SparseLU +50 -0
  73. data/vendor/eigen/Eigen/SparseQR +36 -0
  74. data/vendor/eigen/Eigen/StdDeque +27 -0
  75. data/vendor/eigen/Eigen/StdList +26 -0
  76. data/vendor/eigen/Eigen/StdVector +27 -0
  77. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  78. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  79. data/vendor/eigen/README.md +5 -0
  80. data/vendor/eigen/bench/README.txt +55 -0
  81. data/vendor/eigen/bench/btl/COPYING +340 -0
  82. data/vendor/eigen/bench/btl/README +154 -0
  83. data/vendor/eigen/bench/tensors/README +20 -0
  84. data/vendor/eigen/blas/README.txt +6 -0
  85. data/vendor/eigen/ci/README.md +56 -0
  86. data/vendor/eigen/demos/mandelbrot/README +10 -0
  87. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  88. data/vendor/eigen/demos/opengl/README +13 -0
  89. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1815 -0
  90. data/vendor/eigen/unsupported/README.txt +50 -0
  91. data/vendor/tomotopy/LICENSE +21 -0
  92. data/vendor/tomotopy/README.kr.rst +512 -0
  93. data/vendor/tomotopy/README.rst +516 -0
  94. data/vendor/variant/LICENSE +25 -0
  95. data/vendor/variant/LICENSE_1_0.txt +23 -0
  96. data/vendor/variant/README.md +102 -0
  97. metadata +140 -0
@@ -0,0 +1,56 @@
1
module Tomoto
  # Ruby-side helpers for Tomoto::HLDA.
  #
  # The underscore-prefixed methods (`_new`, `_children_topics`, `_level`, ...)
  # and `k`, `count_by_topics`, `topic_words`, `to_tw` and `init_params` are not
  # defined in this class body. NOTE(review): presumably they come from the
  # native extension and from LDA via inheritance -- confirm.
  class HLDA
    # Builds a model through `_new` and stores the vocabulary-pruning options
    # (@min_cf, @min_df, @rm_top) on it.
    #
    # A nil seed is forwarded to `_new` as -1. Returns whatever `init_params`
    # returns for the new model.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, depth: 2, alpha: 0.1, eta: 0.01, gamma: 0.1, seed: nil)
      model = _new(to_tw(tw), depth, alpha, eta, gamma, seed || -1)
      model.instance_variable_set(:@min_cf, min_cf)
      model.instance_variable_set(:@min_df, min_df)
      model.instance_variable_set(:@rm_top, rm_top)
      init_params(model, binding)
    end

    # Validates topic_id, then returns `_children_topics(topic_id)`.
    def children_topics(topic_id)
      check_topic(topic_id)
      _children_topics(topic_id)
    end

    # Returns `_level(topic_id)` for a live topic, or -1 when the topic is not live.
    def level(topic_id)
      check_topic(topic_id)
      _live_topic?(topic_id) ? _level(topic_id) : -1
    end

    # Validates topic_id, then returns `_live_topic?(topic_id)`.
    def live_topic?(topic_id)
      check_topic(topic_id)
      _live_topic?(topic_id)
    end

    # Validates topic_id, then returns `_num_docs_of_topic(topic_id)`.
    def num_docs_of_topic(topic_id)
      check_topic(topic_id)
      _num_docs_of_topic(topic_id)
    end

    # Returns `_parent_topic(topic_id)` for a live topic, or -1 when the topic is not live.
    def parent_topic(topic_id)
      check_topic(topic_id)
      _live_topic?(topic_id) ? _parent_topic(topic_id) : -1
    end

    private

    # Guards every topic query.
    #
    # Raises RuntimeError when topic_id is negative, when it is not below `k`,
    # or when @prepared has not been set to a truthy value.
    def check_topic(topic_id)
      # level/parent_topic hand out -1 themselves, so a negative id can easily
      # be fed back in; reject it here instead of passing it to the delegates.
      raise "topic_id must be >= 0" if topic_id < 0
      raise "topic_id must be < K" if topic_id >= k
      raise "train() should be called first" unless @prepared
    end

    # Appends one line per topic to summary, walking the tree depth-first from
    # topic 0. Children are visited in ascending id order and each nesting
    # level adds one space of indentation after the leading "| ".
    def topics_info(summary, topic_word_top_n:)
      counts = count_by_topics

      nested_info = lambda do |topic_id = 0, depth = 0|
        words = topic_words(topic_id, top_n: topic_word_top_n).keys.join(" ")
        summary << "| #{" " * depth}##{topic_id} (#{counts[topic_id]}) : #{words}"
        children_topics(topic_id).sort.each do |child_id|
          nested_info.call(child_id, depth + 1)
        end
      end
      nested_info.call
    end
  end
end
data/lib/tomoto/hpa.rb ADDED
@@ -0,0 +1,11 @@
1
module Tomoto
  # Ruby-side constructor for Tomoto::HPA.
  class HPA
    # Creates the model through `_new`, stores the vocabulary-pruning options
    # as instance variables on it, and returns the result of `init_params`.
    # A nil seed is passed to `_new` as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k1: 1, k2: 1, alpha: 0.1, eta: 0.01, seed: nil)
      term_weight = to_tw(tw)
      rng_seed = seed || -1
      instance = _new(term_weight, k1, k2, alpha, eta, rng_seed)

      { :@min_cf => min_cf, :@min_df => min_df, :@rm_top => rm_top }.each do |ivar, value|
        instance.instance_variable_set(ivar, value)
      end

      init_params(instance, binding)
    end
  end
end
data/lib/tomoto/lda.rb ADDED
@@ -0,0 +1,181 @@
1
module Tomoto
  # Ruby-side API for Tomoto::LDA.
  #
  # Underscore-prefixed methods (`_new`, `_load`, `_train`, `_infer`, ...) and
  # the readers used by #summary (`num_docs`, `vocabs`, `alpha`, `k`, ...) are
  # not defined in this file. NOTE(review): presumably they are supplied by the
  # native extension -- confirm.
  class LDA
    # Creates a model through `_new` and stores the vocabulary-pruning options
    # (@min_cf, @min_df, @rm_top) that #prepare later hands to `_prepare`.
    #
    # tw must be listed in TERM_WEIGHT (ArgumentError otherwise). A nil seed is
    # forwarded as -1. Returns the model after `init_params` has recorded the
    # keyword arguments on it.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, seed: nil)
      model = _new(to_tw(tw), k, alpha, eta, seed || -1)
      model.instance_variable_set(:@min_cf, min_cf)
      model.instance_variable_set(:@min_df, min_df)
      model.instance_variable_set(:@rm_top, rm_top)
      init_params(model, binding)
    end

    # Builds a model with default arguments, calls `_load(filename)` on it and
    # returns it.
    def self.load(filename)
      model = new
      model._load(filename)
      model
    end

    # Tokenizes doc (see #tokenize_doc) and passes it to `_add_doc`.
    # Raises RuntimeError once the model has been prepared.
    def add_doc(doc)
      _add_doc(prepare_doc(doc))
    end

    # Tokenizes doc and passes it to `_make_doc`; allowed after preparation.
    def make_doc(doc)
      _make_doc(tokenize_doc(doc))
    end

    # TODO support multiple docs
    #
    # Forwards to `_infer`. Raises RuntimeError when the model has not been
    # prepared yet, and ArgumentError for an unknown parallel scheme.
    def infer(doc, iter: 100, tolerance: -1, workers: 0, parallel: :default, together: 0)
      raise "cannot infer with untrained model" unless defined?(@prepared)
      _infer(doc, iter, tolerance, workers, to_ps(parallel), together)
    end

    # Prepares the model if needed and returns `_count_by_topics`.
    def count_by_topics
      prepare
      _count_by_topics
    end

    # Prepares the model if needed and returns `_removed_top_words(@rm_top)`.
    def removed_top_words
      prepare
      _removed_top_words(@rm_top)
    end

    # Forwards to `_save(filename, full)`.
    def save(filename, full: true)
      _save(filename, full)
    end

    # Builds a multi-section text report and returns it as a single string
    # (it does not print). Sections are separated by a "|" line; the optional
    # sections are controlled by initial_hp, params and topic_word_top_n > 0.
    def summary(initial_hp: true, params: true, topic_word_top_n: 5)
      summary = []

      summary << "<Basic Info>"
      basic_info(summary)
      summary << "|"

      summary << "<Training Info>"
      training_info(summary)
      summary << "|"

      if initial_hp
        summary << "<Initial Parameters>"
        initial_params_info(summary)
        summary << "|"
      end

      if params
        summary << "<Parameters>"
        params_info(summary)
        summary << "|"
      end

      if topic_word_top_n > 0
        summary << "<Topics>"
        topics_info(summary, topic_word_top_n: topic_word_top_n)
        summary << "|"
      end

      # Every section appended a trailing "|" separator; drop the last one.
      summary.pop

      summary.join("\n")
    end

    # With a topic_id, returns `_topic_words(topic_id, top_n)`; without one,
    # returns an array with that result for every topic 0...k.
    def topic_words(topic_id = nil, top_n: 10)
      if topic_id
        _topic_words(topic_id, top_n)
      else
        k.times.map { |i| _topic_words(i, top_n) }
      end
    end

    # Prepares the model if needed, then forwards to `_train`.
    # Raises ArgumentError for an unknown parallel scheme.
    def train(iterations = 10, workers: 0, parallel: :default)
      prepare
      _train(iterations, workers, to_ps(parallel))
    end

    # Maps the index returned by `_tw` back to its TERM_WEIGHT symbol.
    def tw
      TERM_WEIGHT[_tw]
    end

    private

    # Runs `_prepare` with the stored pruning options exactly once; @prepared
    # doubles as the "documents can no longer be added" flag.
    def prepare
      unless defined?(@prepared)
        _prepare(@min_cf, @min_df, @rm_top)
        @prepared = true
      end
    end

    # Tokenizes doc for #add_doc, refusing once the model has been prepared.
    def prepare_doc(doc)
      raise "cannot add_doc() after train()" if defined?(@prepared)
      tokenize_doc(doc)
    end

    # Returns doc unchanged when it is an Array; otherwise splits it on runs
    # of whitespace.
    def tokenize_doc(doc)
      doc = doc.split(/[[:space:]]+/) unless doc.is_a?(Array)
      doc
    end

    # Appends the "<Basic Info>" lines to summary.
    def basic_info(summary)
      freqs = used_vocab_freq
      total = freqs.sum.to_f
      # Shannon entropy H = -sum(p * ln p), written as p * ln(1 / p) so every
      # term is non-negative. Zero frequencies contribute nothing instead of
      # turning the whole sum into NaN (0 * -Infinity).
      entropy = freqs.sum(0.0) do |freq|
        freq.positive? ? (freq / total) * Math.log(total / freq) : 0.0
      end

      summary << "| #{self.class.name.sub("Tomoto::", "")} (current version: #{VERSION})"
      summary << "| #{num_docs} docs, #{num_words} words"
      summary << "| Total Vocabs: #{vocabs.size}, Used Vocabs: #{used_vocabs.size}"
      summary << "| Entropy of words: %.5f" % entropy
      # Fetched once, and only at this point, so preparation is triggered at the
      # same place in the report as before.
      removed = removed_top_words
      summary << "| Removed Vocabs: #{removed.any? ? removed.join(" ") : "<NA>"}"
    end

    # Appends the "<Training Info>" lines to summary.
    def training_info(summary)
      summary << "| Iterations: #{global_step}, Burn-in steps: #{burn_in}"
      summary << "| Optimization Interval: #{optim_interval}"
      summary << "| Log-likelihood per word: %.5f" % ll_per_word
    end

    # Appends one line per recorded constructor argument, or "Not Available"
    # when @init_params was never set on this model.
    def initial_params_info(summary)
      if defined?(@init_params)
        @init_params.each do |k, v|
          summary << "| #{k}: #{v}"
        end
      else
        summary << "| Not Available"
      end
    end

    # Appends the "<Parameters>" lines (alpha and eta) to summary.
    def params_info(summary)
      summary << "| alpha (Dirichlet prior on the per-document topic distributions)"
      summary << "| #{alpha}"
      summary << "| eta (Dirichlet prior on the per-topic word distribution)"
      summary << "| %.5f" % eta
    end

    # Appends one line per topic: its index, its count and its top words.
    def topics_info(summary, topic_word_top_n:)
      counts = count_by_topics
      topic_words(top_n: topic_word_top_n).each_with_index do |words, i|
        summary << "| ##{i} (#{counts[i]}) : #{words.keys.join(" ")}"
      end
    end

    # Converts a PARALLEL_SCHEME symbol to its index; ArgumentError if unknown.
    def to_ps(ps)
      PARALLEL_SCHEME.index(ps) || (raise ArgumentError, "Invalid parallel scheme: #{ps}")
    end

    class << self
      private

      # Converts a TERM_WEIGHT symbol to its index; ArgumentError if unknown.
      def to_tw(tw)
        TERM_WEIGHT.index(tw) || (raise ArgumentError, "Invalid tw: #{tw}")
      end

      # Records the `inspect` form of every optional keyword argument of the
      # receiver's `new` (read from the caller's binding) in @init_params on
      # model, then returns model.
      def init_params(model, caller_binding)
        init_params = {}
        method(:new).parameters.each do |kind, name|
          next if kind != :key
          init_params[name] = caller_binding.local_variable_get(name).inspect
        end
        model.instance_variable_set(:@init_params, init_params)
        model
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Tomoto
  # Ruby-side constructor and document helper for Tomoto::LLDA.
  class LLDA
    # Creates the model through `_new`, stores the vocabulary-pruning options
    # as instance variables on it, and returns the result of `init_params`.
    # A nil seed is passed to `_new` as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, seed: nil)
      created = _new(to_tw(tw), k, alpha, eta, seed || -1)

      [[:@min_cf, min_cf], [:@min_df, min_df], [:@rm_top, rm_top]].each do |ivar, setting|
        created.instance_variable_set(ivar, setting)
      end

      init_params(created, binding)
    end

    # Runs doc through `prepare_doc` and passes the result, together with
    # labels, to `_add_doc`.
    def add_doc(doc, labels: [])
      tokens = prepare_doc(doc)
      _add_doc(tokens, labels)
    end
  end
end
@@ -0,0 +1,15 @@
1
module Tomoto
  # Ruby-side constructor and document helper for Tomoto::MGLDA.
  class MGLDA
    # Creates the model through `_new`, stores the vocabulary-pruning options
    # as instance variables on it, and returns the result of `init_params`.
    #
    # eta_l, gamma and seed are deliberately left commented out of the
    # signature below, exactly as in the original; they are not forwarded.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k_g: 1, k_l: 1, t: 3, alpha_g: 0.1, alpha_l: 0.1, alpha_mg: 0.1, alpha_ml: 0.1, eta_g: 0.01) #, eta_l: 0.01, gamma: 0.1, seed: nil)
      term_weight = to_tw(tw)
      native_args = [term_weight, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g]
      built = _new(*native_args)

      built.instance_variable_set(:@min_cf, min_cf)
      built.instance_variable_set(:@min_df, min_df)
      built.instance_variable_set(:@rm_top, rm_top)

      init_params(built, binding)
    end

    # Runs doc through `prepare_doc` and passes the result, together with
    # delimiter, to `_add_doc`.
    def add_doc(doc, delimiter: ".")
      tokens = prepare_doc(doc)
      _add_doc(tokens, delimiter)
    end
  end
end
data/lib/tomoto/pa.rb ADDED
@@ -0,0 +1,11 @@
1
module Tomoto
  # Ruby-side constructor for Tomoto::PA.
  class PA
    # Creates the model through `_new`, stores the vocabulary-pruning options
    # as instance variables on it, and returns the result of `init_params`.
    # A nil seed is passed to `_new` as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k1: 1, k2: 1, alpha: 0.1, eta: 0.01, seed: nil)
      pa = _new(to_tw(tw), k1, k2, alpha, eta, seed || -1)

      pruning = [min_cf, min_df, rm_top]
      %w[@min_cf @min_df @rm_top].each_with_index do |ivar, position|
        pa.instance_variable_set(ivar, pruning[position])
      end

      init_params(pa, binding)
    end
  end
end
@@ -0,0 +1,15 @@
1
module Tomoto
  # Ruby-side constructor and document helper for Tomoto::PLDA.
  class PLDA
    # Creates the model through `_new`, stores the vocabulary-pruning options
    # as instance variables on it, and returns the result of `init_params`.
    # A nil seed is passed to `_new` as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, latent_topics: 1, alpha: 0.1, eta: 0.01, seed: nil)
      weight_index = to_tw(tw)
      effective_seed = seed || -1
      plda = _new(weight_index, latent_topics, alpha, eta, effective_seed)

      plda.instance_variable_set(:@min_cf, min_cf)
      plda.instance_variable_set(:@min_df, min_df)
      plda.instance_variable_set(:@rm_top, rm_top)

      init_params(plda, binding)
    end

    # Runs doc through `prepare_doc` and passes the result, together with
    # labels, to `_add_doc`.
    def add_doc(doc, labels: [])
      prepared = prepare_doc(doc)
      _add_doc(prepared, labels)
    end
  end
end
@@ -0,0 +1,37 @@
1
module Tomoto
  # Ruby-side constructor and helpers for Tomoto::SLDA.
  class SLDA
    # Creates the model through `_new`, stores the vocabulary-pruning options
    # as instance variables on it, and returns the result of `init_params`.
    #
    # vars is read one character at a time: "l" becomes 0 and "b" becomes 1;
    # any other character raises RuntimeError. A nil seed is passed as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, vars: "", alpha: 0.1, eta: 0.01, mu: [], nu_sq: [], glm_param: [], seed: nil)
      term_weight = to_tw(tw)
      glm_codes = vars.chars.map { |letter| to_glm(letter) }
      slda = _new(term_weight, k, glm_codes, alpha, eta, mu, nu_sq, glm_param, seed || -1)

      slda.instance_variable_set(:@min_cf, min_cf)
      slda.instance_variable_set(:@min_df, min_df)
      slda.instance_variable_set(:@rm_top, rm_top)

      init_params(slda, binding)
    end

    # Runs doc through `prepare_doc` and passes the result, together with y,
    # to `_add_doc`.
    def add_doc(doc, y: [])
      tokens = prepare_doc(doc)
      _add_doc(tokens, y)
    end

    # Returns `_var_type(var_id)`; raises RuntimeError while @prepared is unset.
    def var_type(var_id)
      raise "train() should be called first" unless @prepared

      _var_type(var_id)
    end

    class << self
      private

      # Maps a single response-variable letter to the integer handed to `_new`.
      def to_glm(v)
        return 0 if v == "l"
        return 1 if v == "b"

        raise "Invalid var: #{v}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Tomoto
  # Version string of the gem. Frozen so it cannot be mutated in place by
  # code that interpolates or reports it.
  VERSION = "0.3.0".freeze
end
data/lib/tomoto.rb ADDED
@@ -0,0 +1,27 @@
1
+ # ext
2
# Prefer the precompiled extension stored under the running Ruby's
# MAJOR.MINOR directory and fall back to the unversioned build.
#
# The directory name is taken from RUBY_VERSION with a regexp rather than
# `to_f`: "3.10.0".to_f is 3.1, which would select the directory built for
# Ruby 3.1 when running on Ruby 3.10.
begin
  require "tomoto/#{RUBY_VERSION[/\d+\.\d+/]}/tomoto"
rescue LoadError
  require "tomoto/tomoto"
end
7
+
8
+ # modules
9
+ require "tomoto/ct"
10
+ require "tomoto/dmr"
11
+ require "tomoto/dt"
12
+ require "tomoto/gdmr"
13
+ require "tomoto/hdp"
14
+ require "tomoto/hlda"
15
+ require "tomoto/hpa"
16
+ require "tomoto/lda"
17
+ require "tomoto/llda"
18
+ require "tomoto/mglda"
19
+ require "tomoto/pa"
20
+ require "tomoto/plda"
21
+ require "tomoto/slda"
22
+ require "tomoto/version"
23
+
24
module Tomoto
  # Lookup tables whose element positions serve as integer codes: LDA converts
  # a symbol to its index with `.index` and converts a code back with `[]`.
  # Frozen so the symbol <-> index mapping cannot be altered at runtime.
  PARALLEL_SCHEME = %i[default none copy_merge partition].freeze
  TERM_WEIGHT = %i[one idf pmi].freeze
end
@@ -0,0 +1,24 @@
1
+ /**
2
+ * @file EigenRand
3
+ * @author bab2min (bab2min@gmail.com)
4
+ * @brief
5
+ * @version 0.4.0
6
+ * @date 2021-09-17
7
+ *
8
+ * @copyright Copyright (c) 2020-2021
9
+ *
10
+ */
11
+
12
+ #ifndef EIGENRAND_EIGENRAND_H
13
+ #define EIGENRAND_EIGENRAND_H
14
+
15
+ #include <Eigen/Dense>
16
+
17
+ #include <Eigen/src/Core/util/DisableStupidWarnings.h>
18
+
19
+ #include "Macro.h"
20
+ #include "Core.h"
21
+
22
+ #include <Eigen/src/Core/util/ReenableStupidWarnings.h>
23
+
24
+ #endif
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020, bab2min
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.