tomoto 0.4.0-aarch64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +65 -0
- data/LICENSE.txt +22 -0
- data/README.md +154 -0
- data/ext/tomoto/ct.cpp +58 -0
- data/ext/tomoto/dmr.cpp +69 -0
- data/ext/tomoto/dt.cpp +91 -0
- data/ext/tomoto/extconf.rb +42 -0
- data/ext/tomoto/gdmr.cpp +42 -0
- data/ext/tomoto/hdp.cpp +47 -0
- data/ext/tomoto/hlda.cpp +71 -0
- data/ext/tomoto/hpa.cpp +32 -0
- data/ext/tomoto/lda.cpp +281 -0
- data/ext/tomoto/llda.cpp +46 -0
- data/ext/tomoto/mglda.cpp +81 -0
- data/ext/tomoto/pa.cpp +32 -0
- data/ext/tomoto/plda.cpp +33 -0
- data/ext/tomoto/slda.cpp +48 -0
- data/ext/tomoto/tomoto.cpp +48 -0
- data/ext/tomoto/utils.h +30 -0
- data/lib/tomoto/3.0/tomoto.so +0 -0
- data/lib/tomoto/3.1/tomoto.so +0 -0
- data/lib/tomoto/3.2/tomoto.so +0 -0
- data/lib/tomoto/3.3/tomoto.so +0 -0
- data/lib/tomoto/ct.rb +24 -0
- data/lib/tomoto/dmr.rb +27 -0
- data/lib/tomoto/dt.rb +15 -0
- data/lib/tomoto/gdmr.rb +15 -0
- data/lib/tomoto/hdp.rb +11 -0
- data/lib/tomoto/hlda.rb +56 -0
- data/lib/tomoto/hpa.rb +11 -0
- data/lib/tomoto/lda.rb +186 -0
- data/lib/tomoto/llda.rb +15 -0
- data/lib/tomoto/mglda.rb +15 -0
- data/lib/tomoto/pa.rb +11 -0
- data/lib/tomoto/plda.rb +15 -0
- data/lib/tomoto/slda.rb +37 -0
- data/lib/tomoto/version.rb +3 -0
- data/lib/tomoto.rb +27 -0
- data/vendor/EigenRand/EigenRand/EigenRand +24 -0
- data/vendor/EigenRand/LICENSE +21 -0
- data/vendor/EigenRand/README.md +430 -0
- data/vendor/eigen/COPYING.APACHE +203 -0
- data/vendor/eigen/COPYING.BSD +26 -0
- data/vendor/eigen/COPYING.GPL +674 -0
- data/vendor/eigen/COPYING.LGPL +502 -0
- data/vendor/eigen/COPYING.MINPACK +51 -0
- data/vendor/eigen/COPYING.MPL2 +373 -0
- data/vendor/eigen/COPYING.README +18 -0
- data/vendor/eigen/Eigen/Cholesky +45 -0
- data/vendor/eigen/Eigen/CholmodSupport +48 -0
- data/vendor/eigen/Eigen/Core +384 -0
- data/vendor/eigen/Eigen/Dense +7 -0
- data/vendor/eigen/Eigen/Eigen +2 -0
- data/vendor/eigen/Eigen/Eigenvalues +60 -0
- data/vendor/eigen/Eigen/Geometry +59 -0
- data/vendor/eigen/Eigen/Householder +29 -0
- data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
- data/vendor/eigen/Eigen/Jacobi +32 -0
- data/vendor/eigen/Eigen/KLUSupport +41 -0
- data/vendor/eigen/Eigen/LU +47 -0
- data/vendor/eigen/Eigen/MetisSupport +35 -0
- data/vendor/eigen/Eigen/OrderingMethods +70 -0
- data/vendor/eigen/Eigen/PaStiXSupport +49 -0
- data/vendor/eigen/Eigen/PardisoSupport +35 -0
- data/vendor/eigen/Eigen/QR +50 -0
- data/vendor/eigen/Eigen/QtAlignedMalloc +39 -0
- data/vendor/eigen/Eigen/SPQRSupport +34 -0
- data/vendor/eigen/Eigen/SVD +50 -0
- data/vendor/eigen/Eigen/Sparse +34 -0
- data/vendor/eigen/Eigen/SparseCholesky +37 -0
- data/vendor/eigen/Eigen/SparseCore +69 -0
- data/vendor/eigen/Eigen/SparseLU +50 -0
- data/vendor/eigen/Eigen/SparseQR +36 -0
- data/vendor/eigen/Eigen/StdDeque +27 -0
- data/vendor/eigen/Eigen/StdList +26 -0
- data/vendor/eigen/Eigen/StdVector +27 -0
- data/vendor/eigen/Eigen/SuperLUSupport +64 -0
- data/vendor/eigen/Eigen/UmfPackSupport +40 -0
- data/vendor/eigen/README.md +5 -0
- data/vendor/eigen/bench/README.txt +55 -0
- data/vendor/eigen/bench/btl/COPYING +340 -0
- data/vendor/eigen/bench/btl/README +154 -0
- data/vendor/eigen/bench/tensors/README +20 -0
- data/vendor/eigen/blas/README.txt +6 -0
- data/vendor/eigen/ci/README.md +56 -0
- data/vendor/eigen/demos/mandelbrot/README +10 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
- data/vendor/eigen/demos/opengl/README +13 -0
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1815 -0
- data/vendor/eigen/unsupported/README.txt +50 -0
- data/vendor/tomotopy/LICENSE +21 -0
- data/vendor/tomotopy/README.kr.rst +536 -0
- data/vendor/tomotopy/README.rst +555 -0
- data/vendor/variant/LICENSE +25 -0
- data/vendor/variant/LICENSE_1_0.txt +23 -0
- data/vendor/variant/README.md +102 -0
- metadata +141 -0
data/ext/tomoto/hpa.cpp
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#include <HPA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the HPA class under module `m`, declared as a subclass of the
// IPAModel binding, with a `_new` factory and an `alpha` reader.
void init_hpa(Rice::Module& m) {
  Rice::define_class_under<tomoto::IHPAModel, tomoto::IPAModel>(m, "HPA")
    .define_singleton_function(
      "_new",
      // tw: integer value of tomoto::TermWeight; k1/k2: the two topic counts;
      // alpha/eta: prior parameters; seed: RNG seed or the "no seed" sentinel.
      [](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::HPAArgs args;
        args.k = k1;
        args.k2 = k2;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` guard could never be
        // false. The Ruby wrappers in this package pass `seed || -1` (see
        // ct.rb / dmr.rb); assuming Rice maps that -1 onto the largest size_t
        // (TODO confirm), treat that value as "no seed given" and leave
        // args.seed at whatever HPAArgs initialises it to.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::IHPAModel::create(static_cast<tomoto::TermWeight>(tw), false, args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "alpha",
      // Returns getAlpha(i) for i in 0..k inclusive as a Ruby Array.
      [](tomoto::IHPAModel& self) {
        Array res;
        // use <= to return k+1 elements
        for (size_t i = 0; i <= self.getK(); i++) {
          res.push(self.getAlpha(i));
        }
        return res;
      });
}
|
data/ext/tomoto/lda.cpp
ADDED
@@ -0,0 +1,281 @@
|
|
1
|
+
#include <fstream>
|
2
|
+
#include <iostream>
|
3
|
+
|
4
|
+
#include <LDA.h>
|
5
|
+
|
6
|
+
#include <rice/rice.hpp>
|
7
|
+
|
8
|
+
#include "utils.h"
|
9
|
+
|
10
|
+
class DocumentObject
|
11
|
+
{
|
12
|
+
public:
|
13
|
+
DocumentObject(const tomoto::DocumentBase* _doc, const tomoto::ITopicModel* _tm) : doc{ _doc }, tm{ _tm } {}
|
14
|
+
|
15
|
+
const tomoto::DocumentBase* doc;
|
16
|
+
const tomoto::ITopicModel* tm;
|
17
|
+
};
|
18
|
+
|
19
|
+
void init_lda(Rice::Module& m) {
|
20
|
+
Rice::define_class_under<DocumentObject>(m, "Document")
|
21
|
+
.define_method(
|
22
|
+
"topics",
|
23
|
+
[](DocumentObject& self) {
|
24
|
+
Rice::Hash res;
|
25
|
+
auto topics = self.tm->getTopicsByDoc(self.doc);
|
26
|
+
for (size_t i = 0; i < topics.size(); i++) {
|
27
|
+
res[i] = topics[i];
|
28
|
+
}
|
29
|
+
return res;
|
30
|
+
});
|
31
|
+
|
32
|
+
Rice::define_class_under<tomoto::ILDAModel>(m, "LDA")
|
33
|
+
.define_singleton_function(
|
34
|
+
"_new",
|
35
|
+
[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
|
36
|
+
tomoto::LDAArgs args;
|
37
|
+
args.k = k;
|
38
|
+
args.alpha = {alpha};
|
39
|
+
args.eta = eta;
|
40
|
+
if (seed >= 0) {
|
41
|
+
args.seed = seed;
|
42
|
+
}
|
43
|
+
return tomoto::ILDAModel::create((tomoto::TermWeight)tw, args);
|
44
|
+
}, Rice::Return().takeOwnership())
|
45
|
+
.define_method(
|
46
|
+
"_add_doc",
|
47
|
+
[](tomoto::ILDAModel& self, std::vector<std::string> words) {
|
48
|
+
return self.addDoc(buildDoc(words));
|
49
|
+
})
|
50
|
+
.define_method(
|
51
|
+
"_make_doc",
|
52
|
+
[](tomoto::ILDAModel& self, std::vector<std::string> words) {
|
53
|
+
return DocumentObject(self.makeDoc(buildDoc(words)).release(), &self);
|
54
|
+
})
|
55
|
+
.define_method(
|
56
|
+
"_infer",
|
57
|
+
[](tomoto::ILDAModel& self, DocumentObject& doc_object, size_t iteration, float tolerance, size_t workers, size_t ps, size_t together) {
|
58
|
+
std::vector<tomoto::DocumentBase*> docs;
|
59
|
+
auto doc = doc_object.doc;
|
60
|
+
docs.emplace_back(const_cast<tomoto::DocumentBase*>(doc));
|
61
|
+
float ll = self.infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together)[0];
|
62
|
+
|
63
|
+
auto topic_dist = self.getTopicsByDoc(doc);
|
64
|
+
auto topic_res = Array();
|
65
|
+
for (size_t i = 0; i < topic_dist.size(); i++) {
|
66
|
+
topic_res.push(topic_dist[i]);
|
67
|
+
}
|
68
|
+
|
69
|
+
auto res = Array();
|
70
|
+
res.push(topic_res);
|
71
|
+
res.push(ll);
|
72
|
+
return res;
|
73
|
+
})
|
74
|
+
.define_method(
|
75
|
+
"alpha",
|
76
|
+
[](tomoto::ILDAModel& self) {
|
77
|
+
Array res;
|
78
|
+
for (size_t i = 0; i < self.getK(); i++) {
|
79
|
+
res.push(self.getAlpha(i));
|
80
|
+
}
|
81
|
+
return res;
|
82
|
+
})
|
83
|
+
.define_method(
|
84
|
+
"burn_in",
|
85
|
+
[](tomoto::ILDAModel& self) {
|
86
|
+
return self.getBurnInIteration();
|
87
|
+
})
|
88
|
+
.define_method(
|
89
|
+
"burn_in=",
|
90
|
+
[](tomoto::ILDAModel& self, size_t iteration) {
|
91
|
+
self.setBurnInIteration(iteration);
|
92
|
+
return iteration;
|
93
|
+
})
|
94
|
+
.define_method(
|
95
|
+
"_count_by_topics",
|
96
|
+
[](tomoto::ILDAModel& self) {
|
97
|
+
Array res;
|
98
|
+
for (auto const& v : self.getCountByTopic()) {
|
99
|
+
res.push(v);
|
100
|
+
}
|
101
|
+
return res;
|
102
|
+
})
|
103
|
+
.define_method(
|
104
|
+
"docs",
|
105
|
+
[](tomoto::ILDAModel& self) {
|
106
|
+
Array res;
|
107
|
+
auto n = self.getNumDocs();
|
108
|
+
for (size_t i = 0; i < n; i++) {
|
109
|
+
auto v = DocumentObject(self.getDoc(i), &self);
|
110
|
+
res.push(Object(Rice::detail::To_Ruby<DocumentObject>().convert(v)));
|
111
|
+
}
|
112
|
+
return res;
|
113
|
+
})
|
114
|
+
.define_method(
|
115
|
+
"eta",
|
116
|
+
[](tomoto::ILDAModel& self) {
|
117
|
+
return self.getEta();
|
118
|
+
})
|
119
|
+
.define_method(
|
120
|
+
"global_step",
|
121
|
+
[](tomoto::ILDAModel& self) {
|
122
|
+
return self.getGlobalStep();
|
123
|
+
})
|
124
|
+
.define_method(
|
125
|
+
"k",
|
126
|
+
[](tomoto::ILDAModel& self) {
|
127
|
+
return self.getK();
|
128
|
+
})
|
129
|
+
.define_method(
|
130
|
+
"_load",
|
131
|
+
[](tomoto::ILDAModel& self, const char* filename) {
|
132
|
+
std::ifstream str{ filename, std::ios_base::binary };
|
133
|
+
if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
|
134
|
+
std::vector<uint8_t> extra_data;
|
135
|
+
self.loadModel(str, &extra_data);
|
136
|
+
})
|
137
|
+
.define_method(
|
138
|
+
"ll_per_word",
|
139
|
+
[](tomoto::ILDAModel& self) {
|
140
|
+
return self.getLLPerWord();
|
141
|
+
})
|
142
|
+
.define_method(
|
143
|
+
"num_docs",
|
144
|
+
[](tomoto::ILDAModel& self) {
|
145
|
+
return self.getNumDocs();
|
146
|
+
})
|
147
|
+
.define_method(
|
148
|
+
"num_vocabs",
|
149
|
+
[](tomoto::ILDAModel& self) {
|
150
|
+
return self.getV();
|
151
|
+
})
|
152
|
+
.define_method(
|
153
|
+
"num_words",
|
154
|
+
[](tomoto::ILDAModel& self) {
|
155
|
+
return self.getN();
|
156
|
+
})
|
157
|
+
.define_method(
|
158
|
+
"optim_interval",
|
159
|
+
[](tomoto::ILDAModel& self) {
|
160
|
+
return self.getOptimInterval();
|
161
|
+
})
|
162
|
+
.define_method(
|
163
|
+
"optim_interval=",
|
164
|
+
[](tomoto::ILDAModel& self, size_t value) {
|
165
|
+
self.setOptimInterval(value);
|
166
|
+
return value;
|
167
|
+
})
|
168
|
+
.define_method(
|
169
|
+
"perplexity",
|
170
|
+
[](tomoto::ILDAModel& self) {
|
171
|
+
return self.getPerplexity();
|
172
|
+
})
|
173
|
+
.define_method(
|
174
|
+
"_prepare",
|
175
|
+
[](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
|
176
|
+
self.prepare(true, minCnt, minDf, rmTop);
|
177
|
+
})
|
178
|
+
.define_method(
|
179
|
+
"_removed_top_words",
|
180
|
+
[](tomoto::ILDAModel& self, size_t rmTop) {
|
181
|
+
Array res;
|
182
|
+
auto dict = self.getVocabDict();
|
183
|
+
size_t size = dict.size();
|
184
|
+
for (size_t i = rmTop; i > 0; i--) {
|
185
|
+
res.push(dict.toWord(size - i));
|
186
|
+
}
|
187
|
+
return res;
|
188
|
+
})
|
189
|
+
.define_method(
|
190
|
+
"_save",
|
191
|
+
[](tomoto::ILDAModel& self, const char* filename, bool full) {
|
192
|
+
std::ofstream str{ filename, std::ios_base::binary };
|
193
|
+
std::vector<uint8_t> extra_data;
|
194
|
+
self.saveModel(str, full, &extra_data);
|
195
|
+
})
|
196
|
+
.define_method(
|
197
|
+
"_topic_words",
|
198
|
+
[](tomoto::ILDAModel& self, size_t topicId, size_t topN) {
|
199
|
+
Rice::Hash res;
|
200
|
+
for (auto const& v : self.getWordsByTopicSorted(topicId, topN)) {
|
201
|
+
res[v.first] = v.second;
|
202
|
+
}
|
203
|
+
return res;
|
204
|
+
})
|
205
|
+
.define_method(
|
206
|
+
"_train",
|
207
|
+
[](tomoto::ILDAModel& self, size_t iteration, size_t workers, size_t ps) {
|
208
|
+
self.train(iteration, workers, (tomoto::ParallelScheme)ps);
|
209
|
+
})
|
210
|
+
.define_method(
|
211
|
+
"_tw",
|
212
|
+
[](tomoto::ILDAModel& self) {
|
213
|
+
return (int)self.getTermWeight();
|
214
|
+
})
|
215
|
+
.define_method(
|
216
|
+
"used_vocab_df",
|
217
|
+
[](tomoto::ILDAModel& self) {
|
218
|
+
auto vocab = self.getVocabDf();
|
219
|
+
Array res;
|
220
|
+
for (size_t i = 0; i < self.getV(); i++) {
|
221
|
+
res.push(vocab[i]);
|
222
|
+
}
|
223
|
+
return res;
|
224
|
+
})
|
225
|
+
.define_method(
|
226
|
+
"used_vocab_freq",
|
227
|
+
[](tomoto::ILDAModel& self) {
|
228
|
+
auto vocab = self.getVocabCf();
|
229
|
+
Array res;
|
230
|
+
for (size_t i = 0; i < self.getV(); i++) {
|
231
|
+
res.push(vocab[i]);
|
232
|
+
}
|
233
|
+
return res;
|
234
|
+
})
|
235
|
+
.define_method(
|
236
|
+
"used_vocabs",
|
237
|
+
[](tomoto::ILDAModel& self) {
|
238
|
+
auto dict = self.getVocabDict();
|
239
|
+
Array res;
|
240
|
+
auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
|
241
|
+
for (size_t i = 0; i < self.getV(); i++) {
|
242
|
+
VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
|
243
|
+
Object obj(value);
|
244
|
+
res.push(obj.call("force_encoding", utf8));
|
245
|
+
}
|
246
|
+
return res;
|
247
|
+
})
|
248
|
+
.define_method(
|
249
|
+
"vocab_df",
|
250
|
+
[](tomoto::ILDAModel& self) {
|
251
|
+
auto vocab = self.getVocabDf();
|
252
|
+
Array res;
|
253
|
+
for (size_t i = 0; i < vocab.size(); i++) {
|
254
|
+
res.push(vocab[i]);
|
255
|
+
}
|
256
|
+
return res;
|
257
|
+
})
|
258
|
+
.define_method(
|
259
|
+
"vocab_freq",
|
260
|
+
[](tomoto::ILDAModel& self) {
|
261
|
+
auto vocab = self.getVocabCf();
|
262
|
+
Array res;
|
263
|
+
for (size_t i = 0; i < vocab.size(); i++) {
|
264
|
+
res.push(vocab[i]);
|
265
|
+
}
|
266
|
+
return res;
|
267
|
+
})
|
268
|
+
.define_method(
|
269
|
+
"vocabs",
|
270
|
+
[](tomoto::ILDAModel& self) {
|
271
|
+
auto dict = self.getVocabDict();
|
272
|
+
Array res;
|
273
|
+
auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
|
274
|
+
for (size_t i = 0; i < dict.size(); i++) {
|
275
|
+
VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
|
276
|
+
Object obj(value);
|
277
|
+
res.push(obj.call("force_encoding", utf8));
|
278
|
+
}
|
279
|
+
return res;
|
280
|
+
});
|
281
|
+
}
|
data/ext/tomoto/llda.cpp
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#include <LLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the LLDA class under module `m`, declared as a subclass of the
// ILDAModel binding.
void init_llda(Rice::Module& m) {
  Rice::define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(m, "LLDA")
    .define_singleton_function(
      "_new",
      // tw: integer value of tomoto::TermWeight; k: topic count;
      // alpha/eta: prior parameters; seed: RNG seed or the "no seed" sentinel.
      [](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::LDAArgs args;
        args.k = k;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` guard could never be
        // false. The Ruby wrappers in this package pass `seed || -1` (see
        // ct.rb / dmr.rb); assuming Rice maps that -1 onto the largest size_t
        // (TODO confirm), treat that value as "no seed given" and leave
        // args.seed at whatever LDAArgs initialises it to.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::ILLDAModel::create(static_cast<tomoto::TermWeight>(tw), args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      // Adds a document whose labels are attached under the "labels" misc key.
      [](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
        auto doc = buildDoc(words);
        doc.misc["labels"] = labels;
        return self.addDoc(doc);
      })
    .define_method(
      "topics_per_label",
      [](tomoto::ILLDAModel& self) {
        return self.getNumTopicsPerLabel();
      })
    .define_method(
      "topic_label_dict",
      // Returns every entry of the topic-label dictionary as a UTF-8 tagged string.
      [](tomoto::ILLDAModel& self) {
        // Bind by reference so the dictionary is not copied on every call.
        const auto& dict = self.getTopicLabelDict();
        Array res;
        auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
        for (size_t i = 0; i < dict.size(); i++) {
          VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
          Object obj(value);
          res.push(obj.call("force_encoding", utf8));
        }
        return res;
      });
}
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#include <MGLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the MGLDA class under module `m`, declared as a subclass of the
// ILDAModel binding. Apart from `_new` and `_add_doc`, every method is a
// one-line reader that forwards to the matching getter on the model.
void init_mglda(Rice::Module& m) {
  Rice::define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(m, "MGLDA")
    .define_singleton_function(
      "_new",
      // Copies the constructor arguments into MGLDAArgs and creates the model.
      [](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
        tomoto::MGLDAArgs args;
        // Topic counts and window size.
        args.k = k_g;
        args.kL = k_l;
        args.t = t;
        // Prior parameters.
        args.alpha = {alpha_g};
        args.alphaL = {alpha_l};
        args.alphaMG = alpha_mg;
        args.alphaML = alpha_ml;
        args.eta = eta_g;
        // TODO more args
        const auto term_weight = static_cast<tomoto::TermWeight>(tw);
        return tomoto::IMGLDAModel::create(term_weight, args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      // Adds a document with the delimiter attached under the "delimiter" misc key.
      [](tomoto::IMGLDAModel& model, std::vector<std::string> words, std::string delimiter) {
        auto raw = buildDoc(words);
        raw.misc["delimiter"] = delimiter;
        return model.addDoc(raw);
      })
    .define_method(
      "alpha_g",
      [](tomoto::IMGLDAModel& model) { return model.getAlpha(); })
    .define_method(
      "alpha_l",
      [](tomoto::IMGLDAModel& model) { return model.getAlphaL(); })
    .define_method(
      "alpha_mg",
      [](tomoto::IMGLDAModel& model) { return model.getAlphaM(); })
    .define_method(
      "alpha_ml",
      [](tomoto::IMGLDAModel& model) { return model.getAlphaML(); })
    .define_method(
      "eta_g",
      [](tomoto::IMGLDAModel& model) { return model.getEta(); })
    .define_method(
      "eta_l",
      [](tomoto::IMGLDAModel& model) { return model.getEtaL(); })
    .define_method(
      "gamma",
      [](tomoto::IMGLDAModel& model) { return model.getGamma(); })
    .define_method(
      "k_g",
      [](tomoto::IMGLDAModel& model) { return model.getK(); })
    .define_method(
      "k_l",
      [](tomoto::IMGLDAModel& model) { return model.getKL(); })
    .define_method(
      "t",
      [](tomoto::IMGLDAModel& model) { return model.getT(); });
}
|
data/ext/tomoto/pa.cpp
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#include <PA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the PA class under module `m`, declared as a subclass of the
// ILDAModel binding, with a `_new` factory and readers for both topic counts.
void init_pa(Rice::Module& m) {
  Rice::define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(m, "PA")
    .define_singleton_function(
      "_new",
      // tw: integer value of tomoto::TermWeight; k1/k2: the two topic counts;
      // alpha/eta: prior parameters; seed: RNG seed or the "no seed" sentinel.
      [](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::PAArgs args;
        args.k = k1;
        args.k2 = k2;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` guard could never be
        // false. The Ruby wrappers in this package pass `seed || -1` (see
        // ct.rb / dmr.rb); assuming Rice maps that -1 onto the largest size_t
        // (TODO confirm), treat that value as "no seed given" and leave
        // args.seed at whatever PAArgs initialises it to.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::IPAModel::create(static_cast<tomoto::TermWeight>(tw), args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "k1",
      [](tomoto::IPAModel& self) {
        return self.getK();
      })
    .define_method(
      "k2",
      [](tomoto::IPAModel& self) {
        return self.getK2();
      });
}
|
data/ext/tomoto/plda.cpp
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#include <PLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the PLDA class under module `m`, declared as a subclass of the
// ILLDAModel binding.
void init_plda(Rice::Module& m) {
  Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
    .define_singleton_function(
      "_new",
      // tw: integer value of tomoto::TermWeight; latent_topics: stored in
      // numLatentTopics; alpha/eta: prior parameters; seed: RNG seed or the
      // "no seed" sentinel.
      [](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
        tomoto::PLDAArgs args;
        args.numLatentTopics = latent_topics;
        args.alpha = {alpha};
        args.eta = eta;
        // `seed` is unsigned, so the previous `seed >= 0` guard could never be
        // false. The Ruby wrappers in this package pass `seed || -1` (see
        // ct.rb / dmr.rb); assuming Rice maps that -1 onto the largest size_t
        // (TODO confirm), treat that value as "no seed given" and leave
        // args.seed at whatever PLDAArgs initialises it to.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::IPLDAModel::create(static_cast<tomoto::TermWeight>(tw), args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      // Adds a document whose labels are attached under the "labels" misc key.
      [](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
        auto doc = buildDoc(words);
        doc.misc["labels"] = labels;
        return self.addDoc(doc);
      })
    .define_method(
      "latent_topics",
      [](tomoto::IPLDAModel& self) {
        return self.getNumLatentTopics();
      });
}
|
data/ext/tomoto/slda.cpp
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#include <SLDA.h>
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
// Registers the SLDA class under module `m`, declared as a subclass of the
// ILDAModel binding.
void init_slda(Rice::Module& m) {
  Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
    .define_singleton_function(
      "_new",
      // rb_vars: Ruby Array of integers, each cast to ISLDAModel::GLM;
      // mu / nu_sq / glm_param: copied verbatim into SLDAArgs;
      // seed: RNG seed or the "no seed" sentinel.
      [](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, size_t seed) {
        std::vector<tomoto::ISLDAModel::GLM> vars;
        vars.reserve(rb_vars.size());
        for (auto const& v : rb_vars) {
          vars.push_back(static_cast<tomoto::ISLDAModel::GLM>(Rice::detail::From_Ruby<int>().convert(v.value())));
        }
        tomoto::SLDAArgs args;
        args.k = k;
        args.vars = vars;
        args.alpha = {alpha};
        args.eta = eta;
        args.mu = mu;
        args.nuSq = nu_sq;
        args.glmParam = glm_param;
        // `seed` is unsigned, so the previous `seed >= 0` guard could never be
        // false. The Ruby wrappers in this package pass `seed || -1` (see
        // ct.rb / dmr.rb); assuming Rice maps that -1 onto the largest size_t
        // (TODO confirm), treat that value as "no seed given" and leave
        // args.seed at whatever SLDAArgs initialises it to.
        if (seed != static_cast<size_t>(-1)) {
          args.seed = seed;
        }
        return tomoto::ISLDAModel::create(static_cast<tomoto::TermWeight>(tw), args);
      }, Rice::Return().takeOwnership())
    .define_method(
      "_add_doc",
      // Adds a document with its response values attached under the "y" misc key.
      [](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
        auto doc = buildDoc(words);
        doc.misc["y"] = y;
        return self.addDoc(doc);
      })
    .define_method(
      "f",
      [](tomoto::ISLDAModel& self) {
        return self.getF();
      })
    .define_method(
      "_var_type",
      // Returns "l" when the variable's type is GLM::linear and "b" otherwise;
      // throws when var_id is not below getF().
      [](tomoto::ISLDAModel& self, size_t var_id) {
        if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
        return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
      });
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#include <rice/rice.hpp>
|
2
|
+
|
3
|
+
void init_lda(Rice::Module& m);
|
4
|
+
void init_ct(Rice::Module& m);
|
5
|
+
void init_dmr(Rice::Module& m);
|
6
|
+
void init_dt(Rice::Module& m);
|
7
|
+
void init_gdmr(Rice::Module& m);
|
8
|
+
void init_hdp(Rice::Module& m);
|
9
|
+
void init_hlda(Rice::Module& m);
|
10
|
+
void init_pa(Rice::Module& m);
|
11
|
+
void init_hpa(Rice::Module& m);
|
12
|
+
void init_mglda(Rice::Module& m);
|
13
|
+
void init_llda(Rice::Module& m);
|
14
|
+
void init_plda(Rice::Module& m);
|
15
|
+
void init_slda(Rice::Module& m);
|
16
|
+
|
17
|
+
extern "C"
|
18
|
+
void Init_tomoto()
|
19
|
+
{
|
20
|
+
auto m = Rice::define_module("Tomoto")
|
21
|
+
.define_singleton_function(
|
22
|
+
"isa",
|
23
|
+
[]() {
|
24
|
+
#ifdef __AVX2__
|
25
|
+
return Rice::String("avx2");
|
26
|
+
#elif defined(__AVX__)
|
27
|
+
return Rice::String("avx");
|
28
|
+
#elif defined(__SSE2__) || defined(__x86_64__) || defined(_WIN64)
|
29
|
+
return Rice::String("sse2");
|
30
|
+
#else
|
31
|
+
return Rice::String("none");
|
32
|
+
#endif
|
33
|
+
});
|
34
|
+
|
35
|
+
init_lda(m);
|
36
|
+
init_ct(m);
|
37
|
+
init_dmr(m);
|
38
|
+
init_dt(m);
|
39
|
+
init_gdmr(m);
|
40
|
+
init_hdp(m);
|
41
|
+
init_hlda(m);
|
42
|
+
init_pa(m);
|
43
|
+
init_hpa(m);
|
44
|
+
init_mglda(m);
|
45
|
+
init_llda(m);
|
46
|
+
init_plda(m);
|
47
|
+
init_slda(m);
|
48
|
+
}
|
data/ext/tomoto/utils.h
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
#include <rice/stl.hpp>
|
5
|
+
|
6
|
+
using Rice::Array;
|
7
|
+
using Rice::Object;
|
8
|
+
|
9
|
+
namespace Rice::detail
|
10
|
+
{
|
11
|
+
template<typename T>
|
12
|
+
class To_Ruby<std::vector<T>>
|
13
|
+
{
|
14
|
+
public:
|
15
|
+
VALUE convert(std::vector<T> const & x)
|
16
|
+
{
|
17
|
+
auto a = rb_ary_new2(x.size());
|
18
|
+
for (const auto& v : x) {
|
19
|
+
detail::protect(rb_ary_push, a, To_Ruby<T>().convert(v));
|
20
|
+
}
|
21
|
+
return a;
|
22
|
+
}
|
23
|
+
};
|
24
|
+
}
|
25
|
+
|
26
|
+
// Builds a tomoto::RawDoc whose `rawWords` is a copy of `words`.
// The argument is only read, so it is taken by const reference; this still
// accepts every lvalue the previous non-const signature did, and additionally
// const vectors and temporaries.
inline tomoto::RawDoc buildDoc(const std::vector<std::string>& words) {
  tomoto::RawDoc doc;
  doc.rawWords = words;
  return doc;
}
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tomoto/ct.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Tomoto
  class CT
    # Creates the native model via _new and stores the min_cf / min_df /
    # rm_top settings on it as instance variables. A nil seed is passed to
    # the native side as -1.
    #
    # The method's binding is handed to init_params (defined elsewhere), so
    # no extra method-level locals are introduced here.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, seed: nil)
      model = _new(to_tw(tw), k, alpha, eta, seed || -1)
      [[:@min_cf, min_cf], [:@min_df, min_df], [:@rm_top, rm_top]].each do |ivar, value|
        model.instance_variable_set(ivar, value)
      end
      init_params(model, binding)
    end

    # Calls prepare, then returns _correlations for the given topic, or for
    # every topic 0...k when no topic_id is given.
    def correlations(topic_id = nil)
      prepare
      return _correlations(topic_id) if topic_id

      (0...k).map { |i| _correlations(i) }
    end

    # Splits the flat _prior_cov list into rows of k elements each.
    def prior_cov
      flat = _prior_cov
      flat.each_slice(k).to_a
    end
  end
end
|
data/lib/tomoto/dmr.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module Tomoto
  class DMR
    # Creates the native model via _new and stores the min_cf / min_df /
    # rm_top settings on it as instance variables. A nil seed is passed to
    # the native side as -1. Note that _new receives sigma before eta.
    #
    # The method's binding is handed to init_params (defined elsewhere), so
    # no extra method-level locals are introduced here.
    def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, sigma: 1.0, alpha_epsilon: 1e-10, seed: nil)
      model = _new(to_tw(tw), k, alpha, sigma, eta, alpha_epsilon, seed || -1)
      [[:@min_cf, min_cf], [:@min_df, min_df], [:@rm_top, rm_top]].each do |ivar, value|
        model.instance_variable_set(ivar, value)
      end
      init_params(model, binding)
    end

    # Adds a document together with its metadata string (empty by default).
    def add_doc(doc, metadata: "")
      prepared = prepare_doc(doc)
      _add_doc(prepared, metadata)
    end

    # Returns _lambdas(i) for every topic 0...k, or an empty array when f is 0.
    def lambdas
      return [] if f == 0

      (0...k).map { |i| _lambdas(i) }
    end

    # Element-wise Math.exp of the nested arrays returned by lambdas.
    def alpha
      lambdas.map do |row|
        row.map { |value| Math.exp(value) }
      end
    end
  end
end
|