tomoto 0.3.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +45 -0
- data/LICENSE.txt +22 -0
- data/README.md +162 -0
- data/ext/tomoto/ct.cpp +58 -0
- data/ext/tomoto/dmr.cpp +69 -0
- data/ext/tomoto/dt.cpp +91 -0
- data/ext/tomoto/extconf.rb +34 -0
- data/ext/tomoto/gdmr.cpp +42 -0
- data/ext/tomoto/hdp.cpp +47 -0
- data/ext/tomoto/hlda.cpp +71 -0
- data/ext/tomoto/hpa.cpp +32 -0
- data/ext/tomoto/lda.cpp +281 -0
- data/ext/tomoto/llda.cpp +33 -0
- data/ext/tomoto/mglda.cpp +81 -0
- data/ext/tomoto/pa.cpp +32 -0
- data/ext/tomoto/plda.cpp +33 -0
- data/ext/tomoto/slda.cpp +48 -0
- data/ext/tomoto/tomoto.cpp +48 -0
- data/ext/tomoto/utils.h +30 -0
- data/lib/tomoto/2.7/tomoto.so +0 -0
- data/lib/tomoto/3.0/tomoto.so +0 -0
- data/lib/tomoto/3.1/tomoto.so +0 -0
- data/lib/tomoto/ct.rb +24 -0
- data/lib/tomoto/dmr.rb +27 -0
- data/lib/tomoto/dt.rb +15 -0
- data/lib/tomoto/gdmr.rb +15 -0
- data/lib/tomoto/hdp.rb +11 -0
- data/lib/tomoto/hlda.rb +56 -0
- data/lib/tomoto/hpa.rb +11 -0
- data/lib/tomoto/lda.rb +181 -0
- data/lib/tomoto/llda.rb +15 -0
- data/lib/tomoto/mglda.rb +15 -0
- data/lib/tomoto/pa.rb +11 -0
- data/lib/tomoto/plda.rb +15 -0
- data/lib/tomoto/slda.rb +37 -0
- data/lib/tomoto/version.rb +3 -0
- data/lib/tomoto.rb +27 -0
- data/vendor/EigenRand/EigenRand/EigenRand +24 -0
- data/vendor/EigenRand/LICENSE +21 -0
- data/vendor/EigenRand/README.md +426 -0
- data/vendor/eigen/COPYING.APACHE +203 -0
- data/vendor/eigen/COPYING.BSD +26 -0
- data/vendor/eigen/COPYING.GPL +674 -0
- data/vendor/eigen/COPYING.LGPL +502 -0
- data/vendor/eigen/COPYING.MINPACK +51 -0
- data/vendor/eigen/COPYING.MPL2 +373 -0
- data/vendor/eigen/COPYING.README +18 -0
- data/vendor/eigen/Eigen/Cholesky +45 -0
- data/vendor/eigen/Eigen/CholmodSupport +48 -0
- data/vendor/eigen/Eigen/Core +384 -0
- data/vendor/eigen/Eigen/Dense +7 -0
- data/vendor/eigen/Eigen/Eigen +2 -0
- data/vendor/eigen/Eigen/Eigenvalues +60 -0
- data/vendor/eigen/Eigen/Geometry +59 -0
- data/vendor/eigen/Eigen/Householder +29 -0
- data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
- data/vendor/eigen/Eigen/Jacobi +32 -0
- data/vendor/eigen/Eigen/KLUSupport +41 -0
- data/vendor/eigen/Eigen/LU +47 -0
- data/vendor/eigen/Eigen/MetisSupport +35 -0
- data/vendor/eigen/Eigen/OrderingMethods +70 -0
- data/vendor/eigen/Eigen/PaStiXSupport +49 -0
- data/vendor/eigen/Eigen/PardisoSupport +35 -0
- data/vendor/eigen/Eigen/QR +50 -0
- data/vendor/eigen/Eigen/QtAlignedMalloc +39 -0
- data/vendor/eigen/Eigen/SPQRSupport +34 -0
- data/vendor/eigen/Eigen/SVD +50 -0
- data/vendor/eigen/Eigen/Sparse +34 -0
- data/vendor/eigen/Eigen/SparseCholesky +37 -0
- data/vendor/eigen/Eigen/SparseCore +69 -0
- data/vendor/eigen/Eigen/SparseLU +50 -0
- data/vendor/eigen/Eigen/SparseQR +36 -0
- data/vendor/eigen/Eigen/StdDeque +27 -0
- data/vendor/eigen/Eigen/StdList +26 -0
- data/vendor/eigen/Eigen/StdVector +27 -0
- data/vendor/eigen/Eigen/SuperLUSupport +64 -0
- data/vendor/eigen/Eigen/UmfPackSupport +40 -0
- data/vendor/eigen/README.md +5 -0
- data/vendor/eigen/bench/README.txt +55 -0
- data/vendor/eigen/bench/btl/COPYING +340 -0
- data/vendor/eigen/bench/btl/README +154 -0
- data/vendor/eigen/bench/tensors/README +20 -0
- data/vendor/eigen/blas/README.txt +6 -0
- data/vendor/eigen/ci/README.md +56 -0
- data/vendor/eigen/demos/mandelbrot/README +10 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
- data/vendor/eigen/demos/opengl/README +13 -0
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1815 -0
- data/vendor/eigen/unsupported/README.txt +50 -0
- data/vendor/tomotopy/LICENSE +21 -0
- data/vendor/tomotopy/README.kr.rst +512 -0
- data/vendor/tomotopy/README.rst +516 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- metadata +140 -0
data/ext/tomoto/lda.cpp
ADDED
@@ -0,0 +1,281 @@
|
|
1
|
+
#include <fstream>
|
2
|
+
#include <iostream>
|
3
|
+
|
4
|
+
#include <LDA.h>
|
5
|
+
|
6
|
+
#include <rice/rice.hpp>
|
7
|
+
|
8
|
+
#include "utils.h"
|
9
|
+
|
10
|
+
class DocumentObject
|
11
|
+
{
|
12
|
+
public:
|
13
|
+
DocumentObject(const tomoto::DocumentBase* _doc, const tomoto::ITopicModel* _tm) : doc{ _doc }, tm{ _tm } {}
|
14
|
+
|
15
|
+
const tomoto::DocumentBase* doc;
|
16
|
+
const tomoto::ITopicModel* tm;
|
17
|
+
};
|
18
|
+
|
19
|
+
// Registers the `Document` and `LDA` classes under the given Ruby module.
//
// `Document` wraps a DocumentObject and `LDA` wraps tomoto::ILDAModel. Methods
// whose Ruby name starts with an underscore take purely positional arguments.
void init_lda(Rice::Module& m) {
  Rice::define_class_under<DocumentObject>(m, "Document")
    .define_method(
      "topics",
      [](DocumentObject& self) {
        // Hash of topic index => value reported by getTopicsByDoc.
        Rice::Hash res;
        auto topics = self.tm->getTopicsByDoc(self.doc);
        for (size_t i = 0; i < topics.size(); i++) {
          res[i] = topics[i];
        }
        return res;
      });

  Rice::define_class_under<tomoto::ILDAModel>(m, "LDA")
    .define_singleton_function(
      "_new",
      [](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::LDAArgs args;
        args.k = k;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` test was always true and
        // the "no seed" sentinel was stored as a real seed. The Ruby wrappers in
        // this package (e.g. CT, DMR, HDP) pass `seed || -1`.
        // NOTE(review): assumes Rice converts Ruby -1 to the wrapped unsigned
        // value (all bits set) - confirm.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::ILDAModel::create((tomoto::TermWeight)tw, args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      [](tomoto::ILDAModel& self, std::vector<std::string> words) {
        return self.addDoc(buildDoc(words));
      })
    .define_method(
      "_make_doc",
      // The leading `*` dereferences the lambda's function-pointer conversion, so
      // a plain function is passed instead of the closure object (kept as-is).
      *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
        // NOTE(review): release() hands over a raw pointer that DocumentObject
        // never frees - looks like a leak; confirm intended ownership.
        return DocumentObject(self.makeDoc(buildDoc(words)).release(), &self);
      })
    .define_method(
      "_infer",
      *[](tomoto::ILDAModel& self, DocumentObject& doc_object, size_t iteration, float tolerance, size_t workers, size_t ps, size_t together) {
        std::vector<tomoto::DocumentBase*> docs;
        auto doc = doc_object.doc;
        docs.emplace_back(const_cast<tomoto::DocumentBase*>(doc));
        // Only one document is inferred, so only the first result is read.
        float ll = self.infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together)[0];

        auto topic_dist = self.getTopicsByDoc(doc);
        auto topic_res = Array();
        for (size_t i = 0; i < topic_dist.size(); i++) {
          topic_res.push(topic_dist[i]);
        }

        // Result is the pair [topic values, ll].
        auto res = Array();
        res.push(topic_res);
        res.push(ll);
        return res;
      })
    .define_method(
      "alpha",
      [](tomoto::ILDAModel& self) {
        Array res;
        for (size_t i = 0; i < self.getK(); i++) {
          res.push(self.getAlpha(i));
        }
        return res;
      })
    .define_method(
      "burn_in",
      [](tomoto::ILDAModel& self) {
        return self.getBurnInIteration();
      })
    .define_method(
      "burn_in=",
      [](tomoto::ILDAModel& self, size_t iteration) {
        self.setBurnInIteration(iteration);
        return iteration;
      })
    .define_method(
      "_count_by_topics",
      [](tomoto::ILDAModel& self) {
        Array res;
        for (auto const& v : self.getCountByTopic()) {
          res.push(v);
        }
        return res;
      })
    .define_method(
      "docs",
      [](tomoto::ILDAModel& self) {
        // One Document wrapper per document currently held by the model.
        Array res;
        auto n = self.getNumDocs();
        for (size_t i = 0; i < n; i++) {
          auto v = DocumentObject(self.getDoc(i), &self);
          res.push(Object(Rice::detail::To_Ruby<DocumentObject>().convert(v)));
        }
        return res;
      })
    .define_method(
      "eta",
      [](tomoto::ILDAModel& self) {
        return self.getEta();
      })
    .define_method(
      "global_step",
      [](tomoto::ILDAModel& self) {
        return self.getGlobalStep();
      })
    .define_method(
      "k",
      [](tomoto::ILDAModel& self) {
        return self.getK();
      })
    .define_method(
      "_load",
      [](tomoto::ILDAModel& self, const char* filename) {
        std::ifstream str{ filename, std::ios_base::binary };
        if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
        std::vector<uint8_t> extra_data;
        self.loadModel(str, &extra_data);
      })
    .define_method(
      "ll_per_word",
      [](tomoto::ILDAModel& self) {
        return self.getLLPerWord();
      })
    .define_method(
      "num_docs",
      [](tomoto::ILDAModel& self) {
        return self.getNumDocs();
      })
    .define_method(
      "num_vocabs",
      [](tomoto::ILDAModel& self) {
        return self.getV();
      })
    .define_method(
      "num_words",
      [](tomoto::ILDAModel& self) {
        return self.getN();
      })
    .define_method(
      "optim_interval",
      [](tomoto::ILDAModel& self) {
        return self.getOptimInterval();
      })
    .define_method(
      "optim_interval=",
      [](tomoto::ILDAModel& self, size_t value) {
        self.setOptimInterval(value);
        return value;
      })
    .define_method(
      "perplexity",
      [](tomoto::ILDAModel& self) {
        return self.getPerplexity();
      })
    .define_method(
      "_prepare",
      [](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
        self.prepare(true, minCnt, minDf, rmTop);
      })
    .define_method(
      "_removed_top_words",
      [](tomoto::ILDAModel& self, size_t rmTop) {
        // Returns the last `rmTop` dictionary entries, in dictionary order.
        Array res;
        auto dict = self.getVocabDict();
        size_t size = dict.size();
        // Clamp first: `size - i` is unsigned and would wrap around if more
        // words were requested than the dictionary holds.
        if (rmTop > size) {
          rmTop = size;
        }
        for (size_t i = rmTop; i > 0; i--) {
          res.push(dict.toWord(size - i));
        }
        return res;
      })
    .define_method(
      "_save",
      [](tomoto::ILDAModel& self, const char* filename, bool full) {
        std::ofstream str{ filename, std::ios_base::binary };
        // Fail with the same message as `_load` instead of writing to a dead stream.
        if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
        std::vector<uint8_t> extra_data;
        self.saveModel(str, full, &extra_data);
      })
    .define_method(
      "_topic_words",
      [](tomoto::ILDAModel& self, size_t topicId, size_t topN) {
        // Hash built from the (first, second) pairs of getWordsByTopicSorted.
        Rice::Hash res;
        for (auto const& v : self.getWordsByTopicSorted(topicId, topN)) {
          res[v.first] = v.second;
        }
        return res;
      })
    .define_method(
      "_train",
      [](tomoto::ILDAModel& self, size_t iteration, size_t workers, size_t ps) {
        self.train(iteration, workers, (tomoto::ParallelScheme)ps);
      })
    .define_method(
      "_tw",
      [](tomoto::ILDAModel& self) {
        return (int)self.getTermWeight();
      })
    .define_method(
      "used_vocab_df",
      [](tomoto::ILDAModel& self) {
        // Only the first getV() entries are returned (compare `vocab_df`).
        auto vocab = self.getVocabDf();
        Array res;
        for (size_t i = 0; i < self.getV(); i++) {
          res.push(vocab[i]);
        }
        return res;
      })
    .define_method(
      "used_vocab_freq",
      [](tomoto::ILDAModel& self) {
        // Only the first getV() entries are returned (compare `vocab_freq`).
        auto vocab = self.getVocabCf();
        Array res;
        for (size_t i = 0; i < self.getV(); i++) {
          res.push(vocab[i]);
        }
        return res;
      })
    .define_method(
      "used_vocabs",
      [](tomoto::ILDAModel& self) {
        auto dict = self.getVocabDict();
        Array res;
        auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
        for (size_t i = 0; i < self.getV(); i++) {
          VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
          Object obj(value);
          // Tag each word as UTF-8 on the Ruby side.
          res.push(obj.call("force_encoding", utf8));
        }
        return res;
      })
    .define_method(
      "vocab_df",
      [](tomoto::ILDAModel& self) {
        auto vocab = self.getVocabDf();
        Array res;
        for (size_t i = 0; i < vocab.size(); i++) {
          res.push(vocab[i]);
        }
        return res;
      })
    .define_method(
      "vocab_freq",
      [](tomoto::ILDAModel& self) {
        auto vocab = self.getVocabCf();
        Array res;
        for (size_t i = 0; i < vocab.size(); i++) {
          res.push(vocab[i]);
        }
        return res;
      })
    .define_method(
      "vocabs",
      [](tomoto::ILDAModel& self) {
        auto dict = self.getVocabDict();
        Array res;
        auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
        for (size_t i = 0; i < dict.size(); i++) {
          VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
          Object obj(value);
          // Tag each word as UTF-8 on the Ruby side.
          res.push(obj.call("force_encoding", utf8));
        }
        return res;
      });
}
|
data/ext/tomoto/llda.cpp
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#include <LLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers `LLDA` (tomoto::ILLDAModel) under the given module as a subclass of
// the class bound to tomoto::ILDAModel.
void init_llda(Rice::Module& m) {
  Rice::define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(m, "LLDA")
    .define_singleton_function(
      "_new",
      [](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::LDAArgs args;
        args.k = k;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` test was always true and
        // the "no seed" sentinel was stored as a real seed.
        // NOTE(review): assumes the Ruby side passes -1 for "no seed" and that
        // Rice converts it to the wrapped unsigned value - confirm.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::ILLDAModel::create((tomoto::TermWeight)tw, args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      [](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
        auto doc = buildDoc(words);
        // Labels travel with the document under the "labels" misc key.
        doc.misc["labels"] = labels;
        return self.addDoc(doc);
      })
    .define_method(
      "topics_per_label",
      [](tomoto::ILLDAModel& self) {
        return self.getNumTopicsPerLabel();
      });
}
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#include <MGLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers `MGLDA` (tomoto::IMGLDAModel) under the given module as a subclass
// of the class bound to tomoto::ILDAModel, plus its read-only accessors.
void init_mglda(Rice::Module& m) {
  Rice::define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(m, "MGLDA")
    .define_singleton_function(
      "_new",
      [](size_t tw, size_t k_g, size_t k_l, size_t t,
         tomoto::Float alpha_g, tomoto::Float alpha_l,
         tomoto::Float alpha_mg, tomoto::Float alpha_ml,
         tomoto::Float eta_g) {
        tomoto::MGLDAArgs opts;

        // topic counts and `t`
        opts.k = k_g;
        opts.kL = k_l;
        opts.t = t;

        // priors
        opts.alpha = {alpha_g};
        opts.alphaL = {alpha_l};
        opts.alphaMG = alpha_mg;
        opts.alphaML = alpha_ml;
        opts.eta = eta_g;

        // TODO more args
        return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, opts);
      },
      Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      [](tomoto::IMGLDAModel& model, std::vector<std::string> words, std::string delimiter) {
        auto raw = buildDoc(words);
        // The delimiter travels with the document under the "delimiter" misc key.
        raw.misc["delimiter"] = delimiter;
        return model.addDoc(raw);
      })
    .define_method(
      "alpha_g",
      [](tomoto::IMGLDAModel& model) { return model.getAlpha(); })
    .define_method(
      "alpha_l",
      [](tomoto::IMGLDAModel& model) { return model.getAlphaL(); })
    .define_method(
      "alpha_mg",
      [](tomoto::IMGLDAModel& model) { return model.getAlphaM(); })
    .define_method(
      "alpha_ml",
      [](tomoto::IMGLDAModel& model) { return model.getAlphaML(); })
    .define_method(
      "eta_g",
      [](tomoto::IMGLDAModel& model) { return model.getEta(); })
    .define_method(
      "eta_l",
      [](tomoto::IMGLDAModel& model) { return model.getEtaL(); })
    .define_method(
      "gamma",
      [](tomoto::IMGLDAModel& model) { return model.getGamma(); })
    .define_method(
      "k_g",
      [](tomoto::IMGLDAModel& model) { return model.getK(); })
    .define_method(
      "k_l",
      [](tomoto::IMGLDAModel& model) { return model.getKL(); })
    .define_method(
      "t",
      [](tomoto::IMGLDAModel& model) { return model.getT(); });
}
|
data/ext/tomoto/pa.cpp
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#include <PA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers `PA` (tomoto::IPAModel) under the given module as a subclass of the
// class bound to tomoto::ILDAModel.
void init_pa(Rice::Module& m) {
  Rice::define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(m, "PA")
    .define_singleton_function(
      "_new",
      [](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::PAArgs args;
        args.k = k1;
        args.k2 = k2;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` test was always true and
        // the "no seed" sentinel was stored as a real seed.
        // NOTE(review): assumes the Ruby side passes -1 for "no seed" and that
        // Rice converts it to the wrapped unsigned value - confirm.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::IPAModel::create((tomoto::TermWeight)tw, args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "k1",
      [](tomoto::IPAModel& self) {
        return self.getK();
      })
    .define_method(
      "k2",
      [](tomoto::IPAModel& self) {
        return self.getK2();
      });
}
|
data/ext/tomoto/plda.cpp
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#include <PLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers `PLDA` (tomoto::IPLDAModel) under the given module as a subclass of
// the class bound to tomoto::ILLDAModel.
void init_plda(Rice::Module& m) {
  Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
    .define_singleton_function(
      "_new",
      [](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::PLDAArgs args;
        args.numLatentTopics = latent_topics;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` test was always true and
        // the "no seed" sentinel was stored as a real seed.
        // NOTE(review): assumes the Ruby side passes -1 for "no seed" and that
        // Rice converts it to the wrapped unsigned value - confirm.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      [](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
        auto doc = buildDoc(words);
        // Labels travel with the document under the "labels" misc key.
        doc.misc["labels"] = labels;
        return self.addDoc(doc);
      })
    .define_method(
      "latent_topics",
      [](tomoto::IPLDAModel& self) {
        return self.getNumLatentTopics();
      });
}
|
data/ext/tomoto/slda.cpp
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#include <SLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers `SLDA` (tomoto::ISLDAModel) under the given module as a subclass of
// the class bound to tomoto::ILDAModel.
void init_slda(Rice::Module& m) {
  Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
    .define_singleton_function(
      "_new",
      [](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, size_t seed) {
        // Each Ruby element is read as an int and cast to the GLM enum.
        std::vector<tomoto::ISLDAModel::GLM> vars;
        vars.reserve(rb_vars.size());
        for (auto const& v : rb_vars) {
          vars.push_back((tomoto::ISLDAModel::GLM) Rice::detail::From_Ruby<int>().convert(v.value()));
        }
        tomoto::SLDAArgs args;
        args.k = k;
        args.vars = vars;
        args.alpha = {alpha};
        args.eta = eta;
        args.mu = mu;
        args.nuSq = nu_sq;
        args.glmParam = glm_param;
        // `seed` is unsigned, so the previous `seed >= 0` test was always true and
        // the "no seed" sentinel was stored as a real seed.
        // NOTE(review): assumes the Ruby side passes -1 for "no seed" and that
        // Rice converts it to the wrapped unsigned value - confirm.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      [](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
        auto doc = buildDoc(words);
        // Response values travel with the document under the "y" misc key.
        doc.misc["y"] = y;
        return self.addDoc(doc);
      })
    .define_method(
      "f",
      [](tomoto::ISLDAModel& self) {
        return self.getF();
      })
    .define_method(
      "_var_type",
      [](tomoto::ISLDAModel& self, size_t var_id) {
        // Returns "l" for a linear variable and "b" for anything else.
        if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
        return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
      });
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#include <rice/rice.hpp>
|
2
|
+
|
3
|
+
void init_lda(Rice::Module& m);
|
4
|
+
void init_ct(Rice::Module& m);
|
5
|
+
void init_dmr(Rice::Module& m);
|
6
|
+
void init_dt(Rice::Module& m);
|
7
|
+
void init_gdmr(Rice::Module& m);
|
8
|
+
void init_hdp(Rice::Module& m);
|
9
|
+
void init_hlda(Rice::Module& m);
|
10
|
+
void init_pa(Rice::Module& m);
|
11
|
+
void init_hpa(Rice::Module& m);
|
12
|
+
void init_mglda(Rice::Module& m);
|
13
|
+
void init_llda(Rice::Module& m);
|
14
|
+
void init_plda(Rice::Module& m);
|
15
|
+
void init_slda(Rice::Module& m);
|
16
|
+
|
17
|
+
extern "C"
|
18
|
+
void Init_tomoto()
|
19
|
+
{
|
20
|
+
auto m = Rice::define_module("Tomoto")
|
21
|
+
.define_singleton_function(
|
22
|
+
"isa",
|
23
|
+
[]() {
|
24
|
+
#ifdef __AVX2__
|
25
|
+
return Rice::String("avx2");
|
26
|
+
#elif defined(__AVX__)
|
27
|
+
return Rice::String("avx");
|
28
|
+
#elif defined(__SSE2__) || defined(__x86_64__) || defined(_WIN64)
|
29
|
+
return Rice::String("sse2");
|
30
|
+
#else
|
31
|
+
return Rice::String("none");
|
32
|
+
#endif
|
33
|
+
});
|
34
|
+
|
35
|
+
init_lda(m);
|
36
|
+
init_ct(m);
|
37
|
+
init_dmr(m);
|
38
|
+
init_dt(m);
|
39
|
+
init_gdmr(m);
|
40
|
+
init_hdp(m);
|
41
|
+
init_hlda(m);
|
42
|
+
init_pa(m);
|
43
|
+
init_hpa(m);
|
44
|
+
init_mglda(m);
|
45
|
+
init_llda(m);
|
46
|
+
init_plda(m);
|
47
|
+
init_slda(m);
|
48
|
+
}
|
data/ext/tomoto/utils.h
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
#include <rice/stl.hpp>
|
5
|
+
|
6
|
+
using Rice::Array;
|
7
|
+
using Rice::Object;
|
8
|
+
|
9
|
+
namespace Rice::detail
|
10
|
+
{
|
11
|
+
template<typename T>
|
12
|
+
class To_Ruby<std::vector<T>>
|
13
|
+
{
|
14
|
+
public:
|
15
|
+
VALUE convert(std::vector<T> const & x)
|
16
|
+
{
|
17
|
+
auto a = rb_ary_new2(x.size());
|
18
|
+
for (const auto& v : x) {
|
19
|
+
detail::protect(rb_ary_push, a, To_Ruby<T>().convert(v));
|
20
|
+
}
|
21
|
+
return a;
|
22
|
+
}
|
23
|
+
};
|
24
|
+
}
|
25
|
+
|
26
|
+
// Builds a tomoto::RawDoc whose rawWords is a copy of `words`.
// The argument is only read, so it is taken by const reference; existing
// callers that pass a non-const vector keep working, and temporaries are now
// accepted too.
inline tomoto::RawDoc buildDoc(const std::vector<std::string>& words) {
  tomoto::RawDoc doc;
  doc.rawWords = words;
  return doc;
}
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tomoto/ct.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Tomoto
  # Ruby-side conveniences layered over the native CT binding.
  class CT
    # Creates a model through the native `_new` constructor, records the
    # vocabulary filter settings on it and returns whatever `init_params` returns.
    # A nil seed is passed to the native layer as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, seed: nil)
      model = _new(to_tw(tw), k, alpha, eta, seed || -1)
      { :@min_cf => min_cf, :@min_df => min_df, :@rm_top => rm_top }.each do |ivar, value|
        model.instance_variable_set(ivar, value)
      end
      # `binding` hands this method's local variables to init_params, so the
      # method-level locals are deliberately left unchanged.
      init_params(model, binding)
    end

    # Correlations for one topic when +topic_id+ is given, otherwise one entry
    # per topic index in 0...k. `prepare` always runs first.
    def correlations(topic_id = nil)
      prepare
      return _correlations(topic_id) if topic_id

      (0...k).map { |index| _correlations(index) }
    end

    # The flat `_prior_cov` list cut into consecutive rows of k entries.
    def prior_cov
      flat = _prior_cov
      flat.each_slice(k).to_a
    end
  end
end
|
data/lib/tomoto/dmr.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module Tomoto
  # Ruby-side conveniences layered over the native DMR binding.
  class DMR
    # Creates a model through the native `_new` constructor, records the
    # vocabulary filter settings on it and returns whatever `init_params` returns.
    # Note the native argument order: alpha, sigma, eta, alpha_epsilon, seed.
    # A nil seed is passed to the native layer as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, sigma: 1.0, alpha_epsilon: 1e-10, seed: nil)
      model = _new(to_tw(tw), k, alpha, sigma, eta, alpha_epsilon, seed || -1)
      { :@min_cf => min_cf, :@min_df => min_df, :@rm_top => rm_top }.each do |ivar, value|
        model.instance_variable_set(ivar, value)
      end
      # `binding` hands this method's local variables to init_params, so the
      # method-level locals are deliberately left unchanged.
      init_params(model, binding)
    end

    # Adds a document (run through `prepare_doc`) together with its metadata.
    def add_doc(doc, metadata: "")
      words = prepare_doc(doc)
      _add_doc(words, metadata)
    end

    # One `_lambdas` entry per topic index in 0...k, or an empty list when f is 0.
    def lambdas
      return [] if f == 0

      (0...k).map { |index| _lambdas(index) }
    end

    # Element-wise exponential of `lambdas`.
    def alpha
      lambdas.map do |row|
        row.map { |value| Math.exp(value) }
      end
    end
  end
end
|
data/lib/tomoto/dt.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
module Tomoto
  # Ruby-side conveniences layered over the native DT binding.
  class DT
    # Creates a model through the native `_new` constructor, records the
    # vocabulary filter settings on it and returns whatever `init_params` returns.
    # No seed is forwarded here (the keyword is commented out).
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, t: 1, alpha_var: 0.1, eta_var: 0.1, phi_var: 0.1, lr_a: 0.01, lr_b: 0.1, lr_c: 0.55) #, seed: nil)
      model = _new(to_tw(tw), k, t, alpha_var, eta_var, phi_var, lr_a, lr_b, lr_c)
      { :@min_cf => min_cf, :@min_df => min_df, :@rm_top => rm_top }.each do |ivar, value|
        model.instance_variable_set(ivar, value)
      end
      # `binding` hands this method's local variables to init_params, so the
      # method-level locals are deliberately left unchanged.
      init_params(model, binding)
    end

    # Adds a document (run through `prepare_doc`) at the given timepoint.
    def add_doc(doc, timepoint: 0)
      words = prepare_doc(doc)
      _add_doc(words, timepoint)
    end
  end
end
|
data/lib/tomoto/gdmr.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
module Tomoto
  # Ruby-side conveniences layered over the native GDMR binding.
  class GDMR
    # Creates a model through the native `_new` constructor, records the
    # vocabulary filter settings on it and returns whatever `init_params` returns.
    # Note the native argument order: degrees, alpha, sigma, sigma0, eta,
    # alpha_epsilon, seed. A nil seed is passed to the native layer as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, degrees: [], alpha: 0.1, eta: 0.01, sigma: 1.0, sigma0: 3.0, alpha_epsilon: 1e-10, seed: nil)
      model = _new(to_tw(tw), k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed || -1)
      { :@min_cf => min_cf, :@min_df => min_df, :@rm_top => rm_top }.each do |ivar, value|
        model.instance_variable_set(ivar, value)
      end
      # `binding` hands this method's local variables to init_params, so the
      # method-level locals are deliberately left unchanged.
      init_params(model, binding)
    end

    # Adds a document (run through `prepare_doc`) with its numeric metadata.
    def add_doc(doc, numeric_metadata: [])
      words = prepare_doc(doc)
      _add_doc(words, numeric_metadata)
    end
  end
end
|
data/lib/tomoto/hdp.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Tomoto
  # Ruby-side constructor layered over the native HDP binding.
  class HDP
    # Creates a model through the native `_new` constructor, records the
    # vocabulary filter settings on it and returns whatever `init_params` returns.
    # A nil seed is passed to the native layer as -1.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, initial_k: 2, alpha: 0.1, eta: 0.01, gamma: 0.1, seed: nil)
      model = _new(to_tw(tw), initial_k, alpha, eta, gamma, seed || -1)
      { :@min_cf => min_cf, :@min_df => min_df, :@rm_top => rm_top }.each do |ivar, value|
        model.instance_variable_set(ivar, value)
      end
      # `binding` hands this method's local variables to init_params, so the
      # method-level locals are deliberately left unchanged.
      init_params(model, binding)
    end
  end
end
|