tomoto 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -0
  5. data/ext/tomoto/ct.cpp +54 -0
  6. data/ext/tomoto/dmr.cpp +62 -0
  7. data/ext/tomoto/dt.cpp +82 -0
  8. data/ext/tomoto/ext.cpp +27 -773
  9. data/ext/tomoto/gdmr.cpp +34 -0
  10. data/ext/tomoto/hdp.cpp +42 -0
  11. data/ext/tomoto/hlda.cpp +66 -0
  12. data/ext/tomoto/hpa.cpp +27 -0
  13. data/ext/tomoto/lda.cpp +250 -0
  14. data/ext/tomoto/llda.cpp +29 -0
  15. data/ext/tomoto/mglda.cpp +71 -0
  16. data/ext/tomoto/pa.cpp +27 -0
  17. data/ext/tomoto/plda.cpp +29 -0
  18. data/ext/tomoto/slda.cpp +40 -0
  19. data/ext/tomoto/utils.h +84 -0
  20. data/lib/tomoto/tomoto.bundle +0 -0
  21. data/lib/tomoto/tomoto.so +0 -0
  22. data/lib/tomoto/version.rb +1 -1
  23. data/vendor/tomotopy/README.kr.rst +12 -3
  24. data/vendor/tomotopy/README.rst +12 -3
  25. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
  26. data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
  27. data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
  28. data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
  29. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
  30. data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
  31. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
  32. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
  33. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
  34. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
  35. data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
  36. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
  37. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
  39. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
  40. data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
  41. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
  42. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
  43. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
  44. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
  45. data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
  46. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
  47. data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
  48. data/vendor/tomotopy/src/Utils/math.h +8 -4
  49. data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
  50. metadata +24 -60
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
4
- data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
3
+ metadata.gz: b1cb95a96851ccd3d499ed38c9da531ce48588cf44c37ccc92bbfdc9277e0962
4
+ data.tar.gz: cadee081b1f0ea9cc37b75afd97e8ecebb32796cc335da2ff50e844c955a0e4a
5
5
  SHA512:
6
- metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
7
- data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422
6
+ metadata.gz: f5154bfb71c0b8891953b97c8edf37a7fb70fcb2ab09c3f51126e14262c729dcdc4b82d2727a8601131e090a05efcd1958851d77b5e8e95b922fc9b1f44cedf6
7
+ data.tar.gz: f975f505493d41bc425e0d288762e97c83ffdb6c1812792bc2dca517c550f1508efef79a24bdde992d7acd3994d6566c27745b9bed806ea64dfa072d22c692a0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.1.4 (2021-03-14)
2
+
3
+ - Added `docs` method
4
+ - Updated tomoto to 0.10.2
5
+ - Updated `add_doc` to return the index of the document
6
+
1
7
  ## 0.1.3 (2020-12-19)
2
8
 
3
9
  - Updated tomoto to 0.10.0
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  MIT License
2
2
 
3
3
  Copyright (c) 2019, bab2min
4
- Copyright (c) 2020 Andrew Kane
4
+ Copyright (c) 2020-2021 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -50,6 +50,13 @@ Load the model from a file
50
50
  model = Tomoto::LDA.load("model.bin")
51
51
  ```
52
52
 
53
+ Get topic probabilities for a document
54
+
55
+ ```ruby
56
+ doc = model.docs[0]
57
+ doc.topics
58
+ ```
59
+
53
60
  Get the number of words for each topic
54
61
 
55
62
  ```ruby
data/ext/tomoto/ct.cpp ADDED
@@ -0,0 +1,54 @@
1
+ #include <CT.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_ct(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::ICTModel, tomoto::ILDAModel>(m, "CT")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::ICTModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "_correlations",
19
+ *[](tomoto::ICTModel& self, tomoto::Tid topic_id) {
20
+ return self.getCorrelationTopic(topic_id);
21
+ })
22
+ .define_method(
23
+ "num_beta_sample",
24
+ *[](tomoto::ICTModel& self) {
25
+ return self.getNumBetaSample();
26
+ })
27
+ .define_method(
28
+ "num_beta_sample=",
29
+ *[](tomoto::ICTModel& self, size_t value) {
30
+ self.setNumBetaSample(value);
31
+ return value;
32
+ })
33
+ .define_method(
34
+ "num_tmn_sample",
35
+ *[](tomoto::ICTModel& self) {
36
+ return self.getNumTMNSample();
37
+ })
38
+ .define_method(
39
+ "num_tmn_sample=",
40
+ *[](tomoto::ICTModel& self, size_t value) {
41
+ self.setNumTMNSample(value);
42
+ return value;
43
+ })
44
+ .define_method(
45
+ "_prior_cov",
46
+ *[](tomoto::ICTModel& self) {
47
+ return self.getPriorCov();
48
+ })
49
+ .define_method(
50
+ "prior_mean",
51
+ *[](tomoto::ICTModel& self) {
52
+ return self.getPriorMean();
53
+ });
54
+ }
data/ext/tomoto/dmr.cpp ADDED
@@ -0,0 +1,62 @@
1
+ #include <DMR.h>
2
+
3
+ #include <rice/Class.hpp>
4
+ #include <rice/Module.hpp>
5
+
6
+ #include "utils.h"
7
+
8
+ void init_dmr(Rice::Module& m) {
9
+ Rice::define_class_under<tomoto::IDMRModel, tomoto::ILDAModel>(m, "DMR")
10
+ .define_singleton_method(
11
+ "_new",
12
+ *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
13
+ if (seed < 0) {
14
+ seed = std::random_device{}();
15
+ }
16
+ return tomoto::IDMRModel::create((tomoto::TermWeight)tw, k, alpha, sigma, eta, alpha_epsilon, seed);
17
+ })
18
+ .define_method(
19
+ "_add_doc",
20
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
21
+ auto doc = buildDoc(words);
22
+ doc.misc["metadata"] = metadata;
23
+ return self.addDoc(doc);
24
+ })
25
+ .define_method(
26
+ "alpha_epsilon",
27
+ *[](tomoto::IDMRModel& self) {
28
+ return self.getAlphaEps();
29
+ })
30
+ .define_method(
31
+ "alpha_epsilon=",
32
+ *[](tomoto::IDMRModel& self, tomoto::Float value) {
33
+ self.setAlphaEps(value);
34
+ return value;
35
+ })
36
+ .define_method(
37
+ "f",
38
+ *[](tomoto::IDMRModel& self) {
39
+ return self.getF();
40
+ })
41
+ .define_method(
42
+ "_lambdas",
43
+ *[](tomoto::IDMRModel& self, tomoto::Tid topic_id) {
44
+ return self.getLambdaByTopic(topic_id);
45
+ })
46
+ .define_method(
47
+ "metadata_dict",
48
+ *[](tomoto::IDMRModel& self) {
49
+ auto dict = self.getMetadataDict();
50
+ Array res;
51
+ auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
52
+ for (size_t i = 0; i < dict.size(); i++) {
53
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
54
+ }
55
+ return res;
56
+ })
57
+ .define_method(
58
+ "sigma",
59
+ *[](tomoto::IDMRModel& self) {
60
+ return self.getSigma();
61
+ });
62
+ }
data/ext/tomoto/dt.cpp ADDED
@@ -0,0 +1,82 @@
1
+ #include <DT.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_dt(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IDTModel, tomoto::ILDAModel>(m, "DT")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, size_t t, tomoto::Float alphaVar, tomoto::Float etaVar, tomoto::Float phiVar, tomoto::Float shapeA, tomoto::Float shapeB, tomoto::Float shapeC) {
12
+ // Rice only supports 10 arguments
13
+ int seed = -1;
14
+ if (seed < 0) {
15
+ seed = std::random_device{}();
16
+ }
17
+ return tomoto::IDTModel::create((tomoto::TermWeight)tw, k, t, alphaVar, etaVar, phiVar, shapeA, shapeB, shapeC, 0, seed);
18
+ })
19
+ .define_method(
20
+ "_add_doc",
21
+ *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
22
+ auto doc = buildDoc(words);
23
+ doc.misc["timepoint"] = timepoint;
24
+ return self.addDoc(doc);
25
+ })
26
+ .define_method(
27
+ "alpha",
28
+ *[](tomoto::IDTModel& self) {
29
+ Array res;
30
+ for (size_t i = 0; i < self.getK(); i++) {
31
+ Array res2;
32
+ for (size_t j = 0; j < self.getT(); j++) {
33
+ res2.push(self.getAlpha(i, j));
34
+ }
35
+ res.push(res2);
36
+ }
37
+ return res;
38
+ })
39
+ .define_method(
40
+ "lr_a",
41
+ *[](tomoto::IDTModel& self) {
42
+ return self.getShapeA();
43
+ })
44
+ .define_method(
45
+ "lr_a=",
46
+ *[](tomoto::IDTModel& self, tomoto::Float value) {
47
+ self.setShapeA(value);
48
+ return value;
49
+ })
50
+ .define_method(
51
+ "lr_b",
52
+ *[](tomoto::IDTModel& self) {
53
+ return self.getShapeB();
54
+ })
55
+ .define_method(
56
+ "lr_b=",
57
+ *[](tomoto::IDTModel& self, tomoto::Float value) {
58
+ self.setShapeB(value);
59
+ return value;
60
+ })
61
+ .define_method(
62
+ "lr_c",
63
+ *[](tomoto::IDTModel& self) {
64
+ return self.getShapeC();
65
+ })
66
+ .define_method(
67
+ "lr_c=",
68
+ *[](tomoto::IDTModel& self, tomoto::Float value) {
69
+ self.setShapeC(value);
70
+ return value;
71
+ })
72
+ .define_method(
73
+ "num_docs_by_timepoint",
74
+ *[](tomoto::IDTModel& self) {
75
+ return self.getNumDocsByT();
76
+ })
77
+ .define_method(
78
+ "num_timepoints",
79
+ *[](tomoto::IDTModel& self) {
80
+ return self.getT();
81
+ });
82
+ }
data/ext/tomoto/ext.cpp CHANGED
@@ -1,111 +1,23 @@
1
- // stdlib
2
- #include <fstream>
3
- #include <iostream>
4
-
5
- // tomoto
6
- #include <CT.h>
7
- #include <DMR.h>
8
- #include <DT.h>
9
- #include <GDMR.h>
10
- #include <HDP.h>
11
- #include <HLDA.h>
12
- #include <HPA.h>
13
- #include <LDA.h>
14
- #include <LLDA.h>
15
- #include <MGLDA.h>
16
- #include <PA.h>
17
- #include <PLDA.h>
18
- #include <SLDA.h>
19
-
20
- // rice
21
- #include <rice/Array.hpp>
22
- #include <rice/Hash.hpp>
23
1
  #include <rice/Module.hpp>
24
2
 
25
- using Rice::Array;
26
- using Rice::Class;
27
- using Rice::Hash;
28
- using Rice::Module;
29
- using Rice::Object;
30
- using Rice::define_class_under;
31
- using Rice::define_module;
32
-
33
- template<>
34
- Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
35
- {
36
- Array res;
37
- for (auto const& v : x) {
38
- res.push(v);
39
- }
40
- return res;
41
- }
42
-
43
- template<>
44
- Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
45
- {
46
- Array res;
47
- for (auto const& v : x) {
48
- res.push(v);
49
- }
50
- return res;
51
- }
52
-
53
- template<>
54
- Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
55
- {
56
- Array res;
57
- for (auto const& v : x) {
58
- res.push(v);
59
- }
60
- return res;
61
- }
62
-
63
- template<>
64
- std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
65
- {
66
- Array a = Array(x);
67
- std::vector<std::string> res;
68
- res.reserve(a.size());
69
- for (auto const& v : a) {
70
- res.push_back(from_ruby<std::string>(v));
71
- }
72
- return res;
73
- }
74
-
75
- template<>
76
- std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
77
- {
78
- Array a = Array(x);
79
- std::vector<tomoto::Float> res;
80
- res.reserve(a.size());
81
- for (auto const& v : a) {
82
- res.push_back(from_ruby<tomoto::Float>(v));
83
- }
84
- return res;
85
- }
86
-
87
- template<>
88
- std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
89
- {
90
- Array a = Array(x);
91
- std::vector<uint64_t> res;
92
- res.reserve(a.size());
93
- for (auto const& v : a) {
94
- res.push_back(from_ruby<uint64_t>(v));
95
- }
96
- return res;
97
- }
98
-
99
- tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
100
- tomoto::RawDoc doc;
101
- doc.rawWords = words;
102
- return doc;
103
- }
3
+ void init_lda(Rice::Module& m);
4
+ void init_ct(Rice::Module& m);
5
+ void init_dmr(Rice::Module& m);
6
+ void init_dt(Rice::Module& m);
7
+ void init_gdmr(Rice::Module& m);
8
+ void init_hdp(Rice::Module& m);
9
+ void init_hlda(Rice::Module& m);
10
+ void init_pa(Rice::Module& m);
11
+ void init_hpa(Rice::Module& m);
12
+ void init_mglda(Rice::Module& m);
13
+ void init_llda(Rice::Module& m);
14
+ void init_plda(Rice::Module& m);
15
+ void init_slda(Rice::Module& m);
104
16
 
105
17
  extern "C"
106
18
  void Init_ext()
107
19
  {
108
- Module rb_mTomoto = define_module("Tomoto")
20
+ auto m = Rice::define_module("Tomoto")
109
21
  .define_singleton_method(
110
22
  "isa",
111
23
  *[]() {
@@ -120,675 +32,17 @@ void Init_ext()
120
32
  #endif
121
33
  });
122
34
 
123
- Class rb_cLDA = define_class_under<tomoto::ILDAModel>(rb_mTomoto, "LDA")
124
- .define_singleton_method(
125
- "_new",
126
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
127
- if (seed < 0) {
128
- seed = std::random_device{}();
129
- }
130
- return tomoto::ILDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
131
- })
132
- .define_method(
133
- "_add_doc",
134
- *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
135
- self.addDoc(buildDoc(words));
136
- })
137
- .define_method(
138
- "alpha",
139
- *[](tomoto::ILDAModel& self) {
140
- Array res;
141
- for (size_t i = 0; i < self.getK(); i++) {
142
- res.push(self.getAlpha(i));
143
- }
144
- return res;
145
- })
146
- .define_method(
147
- "burn_in",
148
- *[](tomoto::ILDAModel& self) {
149
- return self.getBurnInIteration();
150
- })
151
- .define_method(
152
- "burn_in=",
153
- *[](tomoto::ILDAModel& self, size_t iteration) {
154
- self.setBurnInIteration(iteration);
155
- return iteration;
156
- })
157
- .define_method(
158
- "_count_by_topics",
159
- *[](tomoto::ILDAModel& self) {
160
- Array res;
161
- for (auto const& v : self.getCountByTopic()) {
162
- res.push(v);
163
- }
164
- return res;
165
- })
166
- .define_method(
167
- "eta",
168
- *[](tomoto::ILDAModel& self) {
169
- return self.getEta();
170
- })
171
- .define_method(
172
- "global_step",
173
- *[](tomoto::ILDAModel& self) {
174
- return self.getGlobalStep();
175
- })
176
- .define_method(
177
- "k",
178
- *[](tomoto::ILDAModel& self) {
179
- return self.getK();
180
- })
181
- .define_method(
182
- "_load",
183
- *[](tomoto::ILDAModel& self, const char* filename) {
184
- std::ifstream str{ filename, std::ios_base::binary };
185
- if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
186
- std::vector<uint8_t> extra_data;
187
- self.loadModel(str, &extra_data);
188
- })
189
- .define_method(
190
- "ll_per_word",
191
- *[](tomoto::ILDAModel& self) {
192
- return self.getLLPerWord();
193
- })
194
- .define_method(
195
- "num_docs",
196
- *[](tomoto::ILDAModel& self) {
197
- return self.getNumDocs();
198
- })
199
- .define_method(
200
- "num_vocabs",
201
- *[](tomoto::ILDAModel& self) {
202
- return self.getV();
203
- })
204
- .define_method(
205
- "num_words",
206
- *[](tomoto::ILDAModel& self) {
207
- return self.getN();
208
- })
209
- .define_method(
210
- "optim_interval",
211
- *[](tomoto::ILDAModel& self) {
212
- return self.getOptimInterval();
213
- })
214
- .define_method(
215
- "optim_interval=",
216
- *[](tomoto::ILDAModel& self, size_t value) {
217
- self.setOptimInterval(value);
218
- return value;
219
- })
220
- .define_method(
221
- "perplexity",
222
- *[](tomoto::ILDAModel& self) {
223
- return self.getPerplexity();
224
- })
225
- .define_method(
226
- "_prepare",
227
- *[](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
228
- self.prepare(true, minCnt, minDf, rmTop);
229
- })
230
- .define_method(
231
- "_removed_top_words",
232
- *[](tomoto::ILDAModel& self, size_t rmTop) {
233
- Array res;
234
- auto dict = self.getVocabDict();
235
- size_t size = dict.size();
236
- for (size_t i = rmTop; i > 0; i--) {
237
- res.push(dict.toWord(size - i));
238
- }
239
- return res;
240
- })
241
- .define_method(
242
- "_save",
243
- *[](tomoto::ILDAModel& self, const char* filename, bool full) {
244
- std::ofstream str{ filename, std::ios_base::binary };
245
- std::vector<uint8_t> extra_data;
246
- self.saveModel(str, full, &extra_data);
247
- })
248
- .define_method(
249
- "_topic_words",
250
- *[](tomoto::ILDAModel& self, size_t topicId, size_t topN) {
251
- Hash res;
252
- for (auto const& v : self.getWordsByTopicSorted(topicId, topN)) {
253
- res[v.first] = v.second;
254
- }
255
- return res;
256
- })
257
- .define_method(
258
- "_train",
259
- *[](tomoto::ILDAModel& self, size_t iteration, size_t workers, size_t ps) {
260
- self.train(iteration, workers, (tomoto::ParallelScheme)ps);
261
- })
262
- .define_method(
263
- "_tw",
264
- *[](tomoto::ILDAModel& self) {
265
- return (int)self.getTermWeight();
266
- })
267
- .define_method(
268
- "used_vocab_df",
269
- *[](tomoto::ILDAModel& self) {
270
- auto vocab = self.getVocabDf();
271
- Array res;
272
- for (size_t i = 0; i < self.getV(); i++) {
273
- res.push(vocab[i]);
274
- }
275
- return res;
276
- })
277
- .define_method(
278
- "used_vocab_freq",
279
- *[](tomoto::ILDAModel& self) {
280
- auto vocab = self.getVocabCf();
281
- Array res;
282
- for (size_t i = 0; i < self.getV(); i++) {
283
- res.push(vocab[i]);
284
- }
285
- return res;
286
- })
287
- .define_method(
288
- "used_vocabs",
289
- *[](tomoto::ILDAModel& self) {
290
- auto dict = self.getVocabDict();
291
- Array res;
292
- auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
293
- for (size_t i = 0; i < self.getV(); i++) {
294
- res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
295
- }
296
- return res;
297
- })
298
- .define_method(
299
- "vocab_df",
300
- *[](tomoto::ILDAModel& self) {
301
- auto vocab = self.getVocabDf();
302
- Array res;
303
- for (size_t i = 0; i < vocab.size(); i++) {
304
- res.push(vocab[i]);
305
- }
306
- return res;
307
- })
308
- .define_method(
309
- "vocab_freq",
310
- *[](tomoto::ILDAModel& self) {
311
- auto vocab = self.getVocabCf();
312
- Array res;
313
- for (size_t i = 0; i < vocab.size(); i++) {
314
- res.push(vocab[i]);
315
- }
316
- return res;
317
- })
318
- .define_method(
319
- "vocabs",
320
- *[](tomoto::ILDAModel& self) {
321
- auto dict = self.getVocabDict();
322
- Array res;
323
- auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
324
- for (size_t i = 0; i < dict.size(); i++) {
325
- res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
326
- }
327
- return res;
328
- });
329
-
330
- Class rb_cCT = define_class_under<tomoto::ICTModel, tomoto::ILDAModel>(rb_mTomoto, "CT")
331
- .define_singleton_method(
332
- "_new",
333
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
334
- if (seed < 0) {
335
- seed = std::random_device{}();
336
- }
337
- return tomoto::ICTModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
338
- })
339
- .define_method(
340
- "_correlations",
341
- *[](tomoto::ICTModel& self, tomoto::Tid topic_id) {
342
- return self.getCorrelationTopic(topic_id);
343
- })
344
- .define_method(
345
- "num_beta_sample",
346
- *[](tomoto::ICTModel& self) {
347
- return self.getNumBetaSample();
348
- })
349
- .define_method(
350
- "num_beta_sample=",
351
- *[](tomoto::ICTModel& self, size_t value) {
352
- self.setNumBetaSample(value);
353
- return value;
354
- })
355
- .define_method(
356
- "num_tmn_sample",
357
- *[](tomoto::ICTModel& self) {
358
- return self.getNumTMNSample();
359
- })
360
- .define_method(
361
- "num_tmn_sample=",
362
- *[](tomoto::ICTModel& self, size_t value) {
363
- self.setNumTMNSample(value);
364
- return value;
365
- })
366
- .define_method(
367
- "_prior_cov",
368
- *[](tomoto::ICTModel& self) {
369
- return self.getPriorCov();
370
- })
371
- .define_method(
372
- "prior_mean",
373
- *[](tomoto::ICTModel& self) {
374
- return self.getPriorMean();
375
- });
376
-
377
- Class rb_cDMR = define_class_under<tomoto::IDMRModel, tomoto::ILDAModel>(rb_mTomoto, "DMR")
378
- .define_singleton_method(
379
- "_new",
380
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
381
- if (seed < 0) {
382
- seed = std::random_device{}();
383
- }
384
- return tomoto::IDMRModel::create((tomoto::TermWeight)tw, k, alpha, sigma, eta, alpha_epsilon, seed);
385
- })
386
- .define_method(
387
- "_add_doc",
388
- *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
389
- auto doc = buildDoc(words);
390
- doc.misc["metadata"] = metadata;
391
- self.addDoc(doc);
392
- })
393
- .define_method(
394
- "alpha_epsilon",
395
- *[](tomoto::IDMRModel& self) {
396
- return self.getAlphaEps();
397
- })
398
- .define_method(
399
- "alpha_epsilon=",
400
- *[](tomoto::IDMRModel& self, tomoto::Float value) {
401
- self.setAlphaEps(value);
402
- return value;
403
- })
404
- .define_method(
405
- "f",
406
- *[](tomoto::IDMRModel& self) {
407
- return self.getF();
408
- })
409
- .define_method(
410
- "_lambdas",
411
- *[](tomoto::IDMRModel& self, tomoto::Tid topic_id) {
412
- return self.getLambdaByTopic(topic_id);
413
- })
414
- .define_method(
415
- "metadata_dict",
416
- *[](tomoto::IDMRModel& self) {
417
- auto dict = self.getMetadataDict();
418
- Array res;
419
- auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
420
- for (size_t i = 0; i < dict.size(); i++) {
421
- res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
422
- }
423
- return res;
424
- })
425
- .define_method(
426
- "sigma",
427
- *[](tomoto::IDMRModel& self) {
428
- return self.getSigma();
429
- });
430
-
431
- Class rb_cDT = define_class_under<tomoto::IDTModel, tomoto::ILDAModel>(rb_mTomoto, "DT")
432
- .define_singleton_method(
433
- "_new",
434
- *[](size_t tw, size_t k, size_t t, tomoto::Float alphaVar, tomoto::Float etaVar, tomoto::Float phiVar, tomoto::Float shapeA, tomoto::Float shapeB, tomoto::Float shapeC) {
435
- // Rice only supports 10 arguments
436
- int seed = -1;
437
- if (seed < 0) {
438
- seed = std::random_device{}();
439
- }
440
- return tomoto::IDTModel::create((tomoto::TermWeight)tw, k, t, alphaVar, etaVar, phiVar, shapeA, shapeB, shapeC, 0, seed);
441
- })
442
- .define_method(
443
- "_add_doc",
444
- *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
445
- auto doc = buildDoc(words);
446
- doc.misc["timepoint"] = timepoint;
447
- self.addDoc(doc);
448
- })
449
- .define_method(
450
- "lr_a",
451
- *[](tomoto::IDTModel& self) {
452
- return self.getShapeA();
453
- })
454
- .define_method(
455
- "lr_a=",
456
- *[](tomoto::IDTModel& self, tomoto::Float value) {
457
- self.setShapeA(value);
458
- return value;
459
- })
460
- .define_method(
461
- "lr_b",
462
- *[](tomoto::IDTModel& self) {
463
- return self.getShapeB();
464
- })
465
- .define_method(
466
- "lr_b=",
467
- *[](tomoto::IDTModel& self, tomoto::Float value) {
468
- self.setShapeB(value);
469
- return value;
470
- })
471
- .define_method(
472
- "lr_c",
473
- *[](tomoto::IDTModel& self) {
474
- return self.getShapeC();
475
- })
476
- .define_method(
477
- "lr_c=",
478
- *[](tomoto::IDTModel& self, tomoto::Float value) {
479
- self.setShapeC(value);
480
- return value;
481
- })
482
- .define_method(
483
- "num_docs_by_timepoint",
484
- *[](tomoto::IDTModel& self) {
485
- return self.getNumDocsByT();
486
- })
487
- .define_method(
488
- "num_timepoints",
489
- *[](tomoto::IDTModel& self) {
490
- return self.getT();
491
- });
492
-
493
- Class rb_cGDMR = define_class_under<tomoto::IGDMRModel, tomoto::IDMRModel>(rb_mTomoto, "GDMR")
494
- .define_singleton_method(
495
- "_new",
496
- *[](size_t tw, size_t k, std::vector<uint64_t> degrees, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float sigma0, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
497
- if (seed < 0) {
498
- seed = std::random_device{}();
499
- }
500
- return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
501
- })
502
- .define_method(
503
- "_add_doc",
504
- *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
505
- auto doc = buildDoc(words);
506
- doc.misc["metadata"] = metadata;
507
- self.addDoc(doc);
508
- })
509
- .define_method(
510
- "degrees",
511
- *[](tomoto::IGDMRModel& self) {
512
- return self.getFs();
513
- })
514
- .define_method(
515
- "sigma0",
516
- *[](tomoto::IGDMRModel& self) {
517
- return self.getSigma0();
518
- });
519
-
520
- Class rb_cHDP = define_class_under<tomoto::IHDPModel, tomoto::ILDAModel>(rb_mTomoto, "HDP")
521
- .define_singleton_method(
522
- "_new",
523
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
524
- if (seed < 0) {
525
- seed = std::random_device{}();
526
- }
527
- return tomoto::IHDPModel::create((tomoto::TermWeight)tw, k, alpha, eta, gamma, seed);
528
- })
529
- .define_method(
530
- "alpha",
531
- *[](tomoto::IHDPModel& self) {
532
- return self.getAlpha();
533
- })
534
- .define_method(
535
- "gamma",
536
- *[](tomoto::IHDPModel& self) {
537
- return self.getGamma();
538
- })
539
- .define_method(
540
- "live_k",
541
- *[](tomoto::IHDPModel& self) {
542
- return self.getLiveK();
543
- })
544
- .define_method(
545
- "live_topic?",
546
- *[](tomoto::IHDPModel& self, size_t tid) {
547
- return self.isLiveTopic(tid);
548
- })
549
- .define_method(
550
- "num_tables",
551
- *[](tomoto::IHDPModel& self) {
552
- return self.getTotalTables();
553
- });
554
-
555
- Class rb_cHLDA = define_class_under<tomoto::IHLDAModel, tomoto::ILDAModel>(rb_mTomoto, "HLDA")
556
- .define_singleton_method(
557
- "_new",
558
- *[](size_t tw, size_t levelDepth, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
559
- if (seed < 0) {
560
- seed = std::random_device{}();
561
- }
562
- return tomoto::IHLDAModel::create((tomoto::TermWeight)tw, levelDepth, alpha, eta, gamma, seed);
563
- })
564
- .define_method(
565
- "alpha",
566
- *[](tomoto::IHLDAModel& self) {
567
- Array res;
568
- for (size_t i = 0; i < self.getLevelDepth(); i++) {
569
- res.push(self.getAlpha(i));
570
- }
571
- return res;
572
- })
573
- .define_method(
574
- "_children_topics",
575
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
576
- return self.getChildTopicId(topic_id);
577
- })
578
- .define_method(
579
- "depth",
580
- *[](tomoto::IHLDAModel& self) {
581
- return self.getLevelDepth();
582
- })
583
- .define_method(
584
- "gamma",
585
- *[](tomoto::IHLDAModel& self) {
586
- return self.getGamma();
587
- })
588
- .define_method(
589
- "_level",
590
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
591
- return self.getLevelOfTopic(topic_id);
592
- })
593
- .define_method(
594
- "live_k",
595
- *[](tomoto::IHLDAModel& self) {
596
- return self.getLiveK();
597
- })
598
- .define_method(
599
- "_live_topic?",
600
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
601
- return self.isLiveTopic(topic_id);
602
- })
603
- .define_method(
604
- "_num_docs_of_topic",
605
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
606
- return self.getNumDocsOfTopic(topic_id);
607
- })
608
- .define_method(
609
- "_parent_topic",
610
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
611
- return self.getParentTopicId(topic_id);
612
- });
613
-
614
- Class rb_cPA = define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(rb_mTomoto, "PA")
615
- .define_singleton_method(
616
- "_new",
617
- *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
618
- if (seed < 0) {
619
- seed = std::random_device{}();
620
- }
621
- return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
622
- })
623
- .define_method(
624
- "k1",
625
- *[](tomoto::IPAModel& self) {
626
- return self.getK();
627
- })
628
- .define_method(
629
- "k2",
630
- *[](tomoto::IPAModel& self) {
631
- return self.getK2();
632
- });
633
-
634
- Class rb_cHPA = define_class_under<tomoto::IHPAModel, tomoto::IPAModel>(rb_mTomoto, "HPA")
635
- .define_singleton_method(
636
- "_new",
637
- *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
638
- if (seed < 0) {
639
- seed = std::random_device{}();
640
- }
641
- return tomoto::IHPAModel::create((tomoto::TermWeight)tw, false, k1, k2, alpha, eta, seed);
642
- })
643
- .define_method(
644
- "alpha",
645
- *[](tomoto::IHPAModel& self) {
646
- Array res;
647
- // use <= to return k+1 elements
648
- for (size_t i = 0; i <= self.getK(); i++) {
649
- res.push(self.getAlpha(i));
650
- }
651
- return res;
652
- });
653
-
654
- Class rb_cMGLDA = define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(rb_mTomoto, "MGLDA")
655
- .define_singleton_method(
656
- "_new",
657
- *[](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
658
- return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
659
- })
660
- .define_method(
661
- "_add_doc",
662
- *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
663
- auto doc = buildDoc(words);
664
- doc.misc["delimiter"] = delimiter;
665
- self.addDoc(doc);
666
- })
667
- .define_method(
668
- "alpha_g",
669
- *[](tomoto::IMGLDAModel& self) {
670
- return self.getAlpha();
671
- })
672
- .define_method(
673
- "alpha_l",
674
- *[](tomoto::IMGLDAModel& self) {
675
- return self.getAlphaL();
676
- })
677
- .define_method(
678
- "alpha_mg",
679
- *[](tomoto::IMGLDAModel& self) {
680
- return self.getAlphaM();
681
- })
682
- .define_method(
683
- "alpha_ml",
684
- *[](tomoto::IMGLDAModel& self) {
685
- return self.getAlphaML();
686
- })
687
- .define_method(
688
- "eta_g",
689
- *[](tomoto::IMGLDAModel& self) {
690
- return self.getEta();
691
- })
692
- .define_method(
693
- "eta_l",
694
- *[](tomoto::IMGLDAModel& self) {
695
- return self.getEtaL();
696
- })
697
- .define_method(
698
- "gamma",
699
- *[](tomoto::IMGLDAModel& self) {
700
- return self.getGamma();
701
- })
702
- .define_method(
703
- "k_g",
704
- *[](tomoto::IMGLDAModel& self) {
705
- return self.getK();
706
- })
707
- .define_method(
708
- "k_l",
709
- *[](tomoto::IMGLDAModel& self) {
710
- return self.getKL();
711
- })
712
- .define_method(
713
- "t",
714
- *[](tomoto::IMGLDAModel& self) {
715
- return self.getT();
716
- });
717
-
718
- Class rb_cLLDA = define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(rb_mTomoto, "LLDA")
719
- .define_singleton_method(
720
- "_new",
721
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
722
- if (seed < 0) {
723
- seed = std::random_device{}();
724
- }
725
- return tomoto::ILLDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
726
- })
727
- .define_method(
728
- "_add_doc",
729
- *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
730
- auto doc = buildDoc(words);
731
- doc.misc["labels"] = labels;
732
- self.addDoc(doc);
733
- })
734
- .define_method(
735
- "topics_per_label",
736
- *[](tomoto::ILLDAModel& self) {
737
- return self.getNumTopicsPerLabel();
738
- });
739
-
740
- Class rb_cPLDA = define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(rb_mTomoto, "PLDA")
741
- .define_singleton_method(
742
- "_new",
743
- *[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, int seed) {
744
- if (seed < 0) {
745
- seed = std::random_device{}();
746
- }
747
- return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
748
- })
749
- .define_method(
750
- "_add_doc",
751
- *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
752
- auto doc = buildDoc(words);
753
- doc.misc["labels"] = labels;
754
- self.addDoc(doc);
755
- })
756
- .define_method(
757
- "latent_topics",
758
- *[](tomoto::IPLDAModel& self) {
759
- return self.getNumLatentTopics();
760
- });
761
-
762
- Class rb_cSLDA = define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(rb_mTomoto, "SLDA")
763
- .define_singleton_method(
764
- "_new",
765
- *[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, int seed) {
766
- if (seed < 0) {
767
- seed = std::random_device{}();
768
- }
769
- std::vector<tomoto::ISLDAModel::GLM> vars;
770
- vars.reserve(rb_vars.size());
771
- for (auto const& v : rb_vars) {
772
- vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
773
- }
774
- return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
775
- })
776
- .define_method(
777
- "_add_doc",
778
- *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
779
- auto doc = buildDoc(words);
780
- doc.misc["y"] = y;
781
- self.addDoc(doc);
782
- })
783
- .define_method(
784
- "f",
785
- *[](tomoto::ISLDAModel& self) {
786
- return self.getF();
787
- })
788
- .define_method(
789
- "_var_type",
790
- *[](tomoto::ISLDAModel& self, size_t var_id) {
791
- if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
792
- return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
793
- });
35
+ init_lda(m);
36
+ init_ct(m);
37
+ init_dmr(m);
38
+ init_dt(m);
39
+ init_gdmr(m);
40
+ init_hdp(m);
41
+ init_hlda(m);
42
+ init_pa(m);
43
+ init_hpa(m);
44
+ init_mglda(m);
45
+ init_llda(m);
46
+ init_plda(m);
47
+ init_slda(m);
794
48
  }