tomoto 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -0
  5. data/ext/tomoto/ct.cpp +54 -0
  6. data/ext/tomoto/dmr.cpp +62 -0
  7. data/ext/tomoto/dt.cpp +82 -0
  8. data/ext/tomoto/ext.cpp +27 -773
  9. data/ext/tomoto/gdmr.cpp +34 -0
  10. data/ext/tomoto/hdp.cpp +42 -0
  11. data/ext/tomoto/hlda.cpp +66 -0
  12. data/ext/tomoto/hpa.cpp +27 -0
  13. data/ext/tomoto/lda.cpp +250 -0
  14. data/ext/tomoto/llda.cpp +29 -0
  15. data/ext/tomoto/mglda.cpp +71 -0
  16. data/ext/tomoto/pa.cpp +27 -0
  17. data/ext/tomoto/plda.cpp +29 -0
  18. data/ext/tomoto/slda.cpp +40 -0
  19. data/ext/tomoto/utils.h +84 -0
  20. data/lib/tomoto/tomoto.bundle +0 -0
  21. data/lib/tomoto/tomoto.so +0 -0
  22. data/lib/tomoto/version.rb +1 -1
  23. data/vendor/tomotopy/README.kr.rst +12 -3
  24. data/vendor/tomotopy/README.rst +12 -3
  25. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
  26. data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
  27. data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
  28. data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
  29. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
  30. data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
  31. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
  32. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
  33. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
  34. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
  35. data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
  36. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
  37. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
  39. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
  40. data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
  41. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
  42. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
  43. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
  44. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
  45. data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
  46. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
  47. data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
  48. data/vendor/tomotopy/src/Utils/math.h +8 -4
  49. data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
  50. metadata +24 -60
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 370ff6a569a6e476137cda0db349cf1b22ff0e7d54b0df67500ffbbe46846a86
4
- data.tar.gz: 8df2fb740a85f8a51485e992dfacd6bef4d3c185d8d08369c72d93b9bb412f6d
3
+ metadata.gz: b1cb95a96851ccd3d499ed38c9da531ce48588cf44c37ccc92bbfdc9277e0962
4
+ data.tar.gz: cadee081b1f0ea9cc37b75afd97e8ecebb32796cc335da2ff50e844c955a0e4a
5
5
  SHA512:
6
- metadata.gz: 17abc53d6923a1fb24bfd040967944884cd7e1595ff4fb82c7af46a4bfae858875b2819b7e05fd2ff92e7fe17eec1350c551aa98bf88eedb344cde00a780f32a
7
- data.tar.gz: 8f504f89679cf072fee04ac5b0319e164a509d140e65dd6f1c77e6fc9ecef252ce4ece0ac80087d0d69ddccf2a5b002910db58bb5212c4c2c7fa5d206b60d422
6
+ metadata.gz: f5154bfb71c0b8891953b97c8edf37a7fb70fcb2ab09c3f51126e14262c729dcdc4b82d2727a8601131e090a05efcd1958851d77b5e8e95b922fc9b1f44cedf6
7
+ data.tar.gz: f975f505493d41bc425e0d288762e97c83ffdb6c1812792bc2dca517c550f1508efef79a24bdde992d7acd3994d6566c27745b9bed806ea64dfa072d22c692a0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.1.4 (2021-03-14)
2
+
3
+ - Added `docs` method
4
+ - Updated tomoto to 0.10.2
5
+ - Updated `add_doc` to return the index of the document
6
+
1
7
  ## 0.1.3 (2020-12-19)
2
8
 
3
9
  - Updated tomoto to 0.10.0
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  MIT License
2
2
 
3
3
  Copyright (c) 2019, bab2min
4
- Copyright (c) 2020 Andrew Kane
4
+ Copyright (c) 2020-2021 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -50,6 +50,13 @@ Load the model from a file
50
50
  model = Tomoto::LDA.load("model.bin")
51
51
  ```
52
52
 
53
+ Get topic probabilities for a document
54
+
55
+ ```ruby
56
+ doc = model.docs[0]
57
+ doc.topics
58
+ ```
59
+
53
60
  Get the number of words for each topic
54
61
 
55
62
  ```ruby
data/ext/tomoto/ct.cpp ADDED
@@ -0,0 +1,54 @@
1
+ #include <CT.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_ct(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::ICTModel, tomoto::ILDAModel>(m, "CT")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::ICTModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "_correlations",
19
+ *[](tomoto::ICTModel& self, tomoto::Tid topic_id) {
20
+ return self.getCorrelationTopic(topic_id);
21
+ })
22
+ .define_method(
23
+ "num_beta_sample",
24
+ *[](tomoto::ICTModel& self) {
25
+ return self.getNumBetaSample();
26
+ })
27
+ .define_method(
28
+ "num_beta_sample=",
29
+ *[](tomoto::ICTModel& self, size_t value) {
30
+ self.setNumBetaSample(value);
31
+ return value;
32
+ })
33
+ .define_method(
34
+ "num_tmn_sample",
35
+ *[](tomoto::ICTModel& self) {
36
+ return self.getNumTMNSample();
37
+ })
38
+ .define_method(
39
+ "num_tmn_sample=",
40
+ *[](tomoto::ICTModel& self, size_t value) {
41
+ self.setNumTMNSample(value);
42
+ return value;
43
+ })
44
+ .define_method(
45
+ "_prior_cov",
46
+ *[](tomoto::ICTModel& self) {
47
+ return self.getPriorCov();
48
+ })
49
+ .define_method(
50
+ "prior_mean",
51
+ *[](tomoto::ICTModel& self) {
52
+ return self.getPriorMean();
53
+ });
54
+ }
data/ext/tomoto/dmr.cpp ADDED
@@ -0,0 +1,62 @@
1
+ #include <DMR.h>
2
+
3
+ #include <rice/Class.hpp>
4
+ #include <rice/Module.hpp>
5
+
6
+ #include "utils.h"
7
+
8
+ void init_dmr(Rice::Module& m) {
9
+ Rice::define_class_under<tomoto::IDMRModel, tomoto::ILDAModel>(m, "DMR")
10
+ .define_singleton_method(
11
+ "_new",
12
+ *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
13
+ if (seed < 0) {
14
+ seed = std::random_device{}();
15
+ }
16
+ return tomoto::IDMRModel::create((tomoto::TermWeight)tw, k, alpha, sigma, eta, alpha_epsilon, seed);
17
+ })
18
+ .define_method(
19
+ "_add_doc",
20
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
21
+ auto doc = buildDoc(words);
22
+ doc.misc["metadata"] = metadata;
23
+ return self.addDoc(doc);
24
+ })
25
+ .define_method(
26
+ "alpha_epsilon",
27
+ *[](tomoto::IDMRModel& self) {
28
+ return self.getAlphaEps();
29
+ })
30
+ .define_method(
31
+ "alpha_epsilon=",
32
+ *[](tomoto::IDMRModel& self, tomoto::Float value) {
33
+ self.setAlphaEps(value);
34
+ return value;
35
+ })
36
+ .define_method(
37
+ "f",
38
+ *[](tomoto::IDMRModel& self) {
39
+ return self.getF();
40
+ })
41
+ .define_method(
42
+ "_lambdas",
43
+ *[](tomoto::IDMRModel& self, tomoto::Tid topic_id) {
44
+ return self.getLambdaByTopic(topic_id);
45
+ })
46
+ .define_method(
47
+ "metadata_dict",
48
+ *[](tomoto::IDMRModel& self) {
49
+ auto dict = self.getMetadataDict();
50
+ Array res;
51
+ auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
52
+ for (size_t i = 0; i < dict.size(); i++) {
53
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
54
+ }
55
+ return res;
56
+ })
57
+ .define_method(
58
+ "sigma",
59
+ *[](tomoto::IDMRModel& self) {
60
+ return self.getSigma();
61
+ });
62
+ }
data/ext/tomoto/dt.cpp ADDED
@@ -0,0 +1,82 @@
1
+ #include <DT.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_dt(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IDTModel, tomoto::ILDAModel>(m, "DT")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, size_t t, tomoto::Float alphaVar, tomoto::Float etaVar, tomoto::Float phiVar, tomoto::Float shapeA, tomoto::Float shapeB, tomoto::Float shapeC) {
12
+ // Rice only supports 10 arguments
13
+ int seed = -1;
14
+ if (seed < 0) {
15
+ seed = std::random_device{}();
16
+ }
17
+ return tomoto::IDTModel::create((tomoto::TermWeight)tw, k, t, alphaVar, etaVar, phiVar, shapeA, shapeB, shapeC, 0, seed);
18
+ })
19
+ .define_method(
20
+ "_add_doc",
21
+ *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
22
+ auto doc = buildDoc(words);
23
+ doc.misc["timepoint"] = timepoint;
24
+ return self.addDoc(doc);
25
+ })
26
+ .define_method(
27
+ "alpha",
28
+ *[](tomoto::IDTModel& self) {
29
+ Array res;
30
+ for (size_t i = 0; i < self.getK(); i++) {
31
+ Array res2;
32
+ for (size_t j = 0; j < self.getT(); j++) {
33
+ res2.push(self.getAlpha(i, j));
34
+ }
35
+ res.push(res2);
36
+ }
37
+ return res;
38
+ })
39
+ .define_method(
40
+ "lr_a",
41
+ *[](tomoto::IDTModel& self) {
42
+ return self.getShapeA();
43
+ })
44
+ .define_method(
45
+ "lr_a=",
46
+ *[](tomoto::IDTModel& self, tomoto::Float value) {
47
+ self.setShapeA(value);
48
+ return value;
49
+ })
50
+ .define_method(
51
+ "lr_b",
52
+ *[](tomoto::IDTModel& self) {
53
+ return self.getShapeB();
54
+ })
55
+ .define_method(
56
+ "lr_b=",
57
+ *[](tomoto::IDTModel& self, tomoto::Float value) {
58
+ self.setShapeB(value);
59
+ return value;
60
+ })
61
+ .define_method(
62
+ "lr_c",
63
+ *[](tomoto::IDTModel& self) {
64
+ return self.getShapeC();
65
+ })
66
+ .define_method(
67
+ "lr_c=",
68
+ *[](tomoto::IDTModel& self, tomoto::Float value) {
69
+ self.setShapeC(value);
70
+ return value;
71
+ })
72
+ .define_method(
73
+ "num_docs_by_timepoint",
74
+ *[](tomoto::IDTModel& self) {
75
+ return self.getNumDocsByT();
76
+ })
77
+ .define_method(
78
+ "num_timepoints",
79
+ *[](tomoto::IDTModel& self) {
80
+ return self.getT();
81
+ });
82
+ }
data/ext/tomoto/ext.cpp CHANGED
@@ -1,111 +1,23 @@
1
- // stdlib
2
- #include <fstream>
3
- #include <iostream>
4
-
5
- // tomoto
6
- #include <CT.h>
7
- #include <DMR.h>
8
- #include <DT.h>
9
- #include <GDMR.h>
10
- #include <HDP.h>
11
- #include <HLDA.h>
12
- #include <HPA.h>
13
- #include <LDA.h>
14
- #include <LLDA.h>
15
- #include <MGLDA.h>
16
- #include <PA.h>
17
- #include <PLDA.h>
18
- #include <SLDA.h>
19
-
20
- // rice
21
- #include <rice/Array.hpp>
22
- #include <rice/Hash.hpp>
23
1
  #include <rice/Module.hpp>
24
2
 
25
- using Rice::Array;
26
- using Rice::Class;
27
- using Rice::Hash;
28
- using Rice::Module;
29
- using Rice::Object;
30
- using Rice::define_class_under;
31
- using Rice::define_module;
32
-
33
- template<>
34
- Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
35
- {
36
- Array res;
37
- for (auto const& v : x) {
38
- res.push(v);
39
- }
40
- return res;
41
- }
42
-
43
- template<>
44
- Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
45
- {
46
- Array res;
47
- for (auto const& v : x) {
48
- res.push(v);
49
- }
50
- return res;
51
- }
52
-
53
- template<>
54
- Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
55
- {
56
- Array res;
57
- for (auto const& v : x) {
58
- res.push(v);
59
- }
60
- return res;
61
- }
62
-
63
- template<>
64
- std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
65
- {
66
- Array a = Array(x);
67
- std::vector<std::string> res;
68
- res.reserve(a.size());
69
- for (auto const& v : a) {
70
- res.push_back(from_ruby<std::string>(v));
71
- }
72
- return res;
73
- }
74
-
75
- template<>
76
- std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
77
- {
78
- Array a = Array(x);
79
- std::vector<tomoto::Float> res;
80
- res.reserve(a.size());
81
- for (auto const& v : a) {
82
- res.push_back(from_ruby<tomoto::Float>(v));
83
- }
84
- return res;
85
- }
86
-
87
- template<>
88
- std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
89
- {
90
- Array a = Array(x);
91
- std::vector<uint64_t> res;
92
- res.reserve(a.size());
93
- for (auto const& v : a) {
94
- res.push_back(from_ruby<uint64_t>(v));
95
- }
96
- return res;
97
- }
98
-
99
- tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
100
- tomoto::RawDoc doc;
101
- doc.rawWords = words;
102
- return doc;
103
- }
3
+ void init_lda(Rice::Module& m);
4
+ void init_ct(Rice::Module& m);
5
+ void init_dmr(Rice::Module& m);
6
+ void init_dt(Rice::Module& m);
7
+ void init_gdmr(Rice::Module& m);
8
+ void init_hdp(Rice::Module& m);
9
+ void init_hlda(Rice::Module& m);
10
+ void init_pa(Rice::Module& m);
11
+ void init_hpa(Rice::Module& m);
12
+ void init_mglda(Rice::Module& m);
13
+ void init_llda(Rice::Module& m);
14
+ void init_plda(Rice::Module& m);
15
+ void init_slda(Rice::Module& m);
104
16
 
105
17
  extern "C"
106
18
  void Init_ext()
107
19
  {
108
- Module rb_mTomoto = define_module("Tomoto")
20
+ auto m = Rice::define_module("Tomoto")
109
21
  .define_singleton_method(
110
22
  "isa",
111
23
  *[]() {
@@ -120,675 +32,17 @@ void Init_ext()
120
32
  #endif
121
33
  });
122
34
 
123
- Class rb_cLDA = define_class_under<tomoto::ILDAModel>(rb_mTomoto, "LDA")
124
- .define_singleton_method(
125
- "_new",
126
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
127
- if (seed < 0) {
128
- seed = std::random_device{}();
129
- }
130
- return tomoto::ILDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
131
- })
132
- .define_method(
133
- "_add_doc",
134
- *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
135
- self.addDoc(buildDoc(words));
136
- })
137
- .define_method(
138
- "alpha",
139
- *[](tomoto::ILDAModel& self) {
140
- Array res;
141
- for (size_t i = 0; i < self.getK(); i++) {
142
- res.push(self.getAlpha(i));
143
- }
144
- return res;
145
- })
146
- .define_method(
147
- "burn_in",
148
- *[](tomoto::ILDAModel& self) {
149
- return self.getBurnInIteration();
150
- })
151
- .define_method(
152
- "burn_in=",
153
- *[](tomoto::ILDAModel& self, size_t iteration) {
154
- self.setBurnInIteration(iteration);
155
- return iteration;
156
- })
157
- .define_method(
158
- "_count_by_topics",
159
- *[](tomoto::ILDAModel& self) {
160
- Array res;
161
- for (auto const& v : self.getCountByTopic()) {
162
- res.push(v);
163
- }
164
- return res;
165
- })
166
- .define_method(
167
- "eta",
168
- *[](tomoto::ILDAModel& self) {
169
- return self.getEta();
170
- })
171
- .define_method(
172
- "global_step",
173
- *[](tomoto::ILDAModel& self) {
174
- return self.getGlobalStep();
175
- })
176
- .define_method(
177
- "k",
178
- *[](tomoto::ILDAModel& self) {
179
- return self.getK();
180
- })
181
- .define_method(
182
- "_load",
183
- *[](tomoto::ILDAModel& self, const char* filename) {
184
- std::ifstream str{ filename, std::ios_base::binary };
185
- if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
186
- std::vector<uint8_t> extra_data;
187
- self.loadModel(str, &extra_data);
188
- })
189
- .define_method(
190
- "ll_per_word",
191
- *[](tomoto::ILDAModel& self) {
192
- return self.getLLPerWord();
193
- })
194
- .define_method(
195
- "num_docs",
196
- *[](tomoto::ILDAModel& self) {
197
- return self.getNumDocs();
198
- })
199
- .define_method(
200
- "num_vocabs",
201
- *[](tomoto::ILDAModel& self) {
202
- return self.getV();
203
- })
204
- .define_method(
205
- "num_words",
206
- *[](tomoto::ILDAModel& self) {
207
- return self.getN();
208
- })
209
- .define_method(
210
- "optim_interval",
211
- *[](tomoto::ILDAModel& self) {
212
- return self.getOptimInterval();
213
- })
214
- .define_method(
215
- "optim_interval=",
216
- *[](tomoto::ILDAModel& self, size_t value) {
217
- self.setOptimInterval(value);
218
- return value;
219
- })
220
- .define_method(
221
- "perplexity",
222
- *[](tomoto::ILDAModel& self) {
223
- return self.getPerplexity();
224
- })
225
- .define_method(
226
- "_prepare",
227
- *[](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
228
- self.prepare(true, minCnt, minDf, rmTop);
229
- })
230
- .define_method(
231
- "_removed_top_words",
232
- *[](tomoto::ILDAModel& self, size_t rmTop) {
233
- Array res;
234
- auto dict = self.getVocabDict();
235
- size_t size = dict.size();
236
- for (size_t i = rmTop; i > 0; i--) {
237
- res.push(dict.toWord(size - i));
238
- }
239
- return res;
240
- })
241
- .define_method(
242
- "_save",
243
- *[](tomoto::ILDAModel& self, const char* filename, bool full) {
244
- std::ofstream str{ filename, std::ios_base::binary };
245
- std::vector<uint8_t> extra_data;
246
- self.saveModel(str, full, &extra_data);
247
- })
248
- .define_method(
249
- "_topic_words",
250
- *[](tomoto::ILDAModel& self, size_t topicId, size_t topN) {
251
- Hash res;
252
- for (auto const& v : self.getWordsByTopicSorted(topicId, topN)) {
253
- res[v.first] = v.second;
254
- }
255
- return res;
256
- })
257
- .define_method(
258
- "_train",
259
- *[](tomoto::ILDAModel& self, size_t iteration, size_t workers, size_t ps) {
260
- self.train(iteration, workers, (tomoto::ParallelScheme)ps);
261
- })
262
- .define_method(
263
- "_tw",
264
- *[](tomoto::ILDAModel& self) {
265
- return (int)self.getTermWeight();
266
- })
267
- .define_method(
268
- "used_vocab_df",
269
- *[](tomoto::ILDAModel& self) {
270
- auto vocab = self.getVocabDf();
271
- Array res;
272
- for (size_t i = 0; i < self.getV(); i++) {
273
- res.push(vocab[i]);
274
- }
275
- return res;
276
- })
277
- .define_method(
278
- "used_vocab_freq",
279
- *[](tomoto::ILDAModel& self) {
280
- auto vocab = self.getVocabCf();
281
- Array res;
282
- for (size_t i = 0; i < self.getV(); i++) {
283
- res.push(vocab[i]);
284
- }
285
- return res;
286
- })
287
- .define_method(
288
- "used_vocabs",
289
- *[](tomoto::ILDAModel& self) {
290
- auto dict = self.getVocabDict();
291
- Array res;
292
- auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
293
- for (size_t i = 0; i < self.getV(); i++) {
294
- res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
295
- }
296
- return res;
297
- })
298
- .define_method(
299
- "vocab_df",
300
- *[](tomoto::ILDAModel& self) {
301
- auto vocab = self.getVocabDf();
302
- Array res;
303
- for (size_t i = 0; i < vocab.size(); i++) {
304
- res.push(vocab[i]);
305
- }
306
- return res;
307
- })
308
- .define_method(
309
- "vocab_freq",
310
- *[](tomoto::ILDAModel& self) {
311
- auto vocab = self.getVocabCf();
312
- Array res;
313
- for (size_t i = 0; i < vocab.size(); i++) {
314
- res.push(vocab[i]);
315
- }
316
- return res;
317
- })
318
- .define_method(
319
- "vocabs",
320
- *[](tomoto::ILDAModel& self) {
321
- auto dict = self.getVocabDict();
322
- Array res;
323
- auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
324
- for (size_t i = 0; i < dict.size(); i++) {
325
- res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
326
- }
327
- return res;
328
- });
329
-
330
- Class rb_cCT = define_class_under<tomoto::ICTModel, tomoto::ILDAModel>(rb_mTomoto, "CT")
331
- .define_singleton_method(
332
- "_new",
333
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
334
- if (seed < 0) {
335
- seed = std::random_device{}();
336
- }
337
- return tomoto::ICTModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
338
- })
339
- .define_method(
340
- "_correlations",
341
- *[](tomoto::ICTModel& self, tomoto::Tid topic_id) {
342
- return self.getCorrelationTopic(topic_id);
343
- })
344
- .define_method(
345
- "num_beta_sample",
346
- *[](tomoto::ICTModel& self) {
347
- return self.getNumBetaSample();
348
- })
349
- .define_method(
350
- "num_beta_sample=",
351
- *[](tomoto::ICTModel& self, size_t value) {
352
- self.setNumBetaSample(value);
353
- return value;
354
- })
355
- .define_method(
356
- "num_tmn_sample",
357
- *[](tomoto::ICTModel& self) {
358
- return self.getNumTMNSample();
359
- })
360
- .define_method(
361
- "num_tmn_sample=",
362
- *[](tomoto::ICTModel& self, size_t value) {
363
- self.setNumTMNSample(value);
364
- return value;
365
- })
366
- .define_method(
367
- "_prior_cov",
368
- *[](tomoto::ICTModel& self) {
369
- return self.getPriorCov();
370
- })
371
- .define_method(
372
- "prior_mean",
373
- *[](tomoto::ICTModel& self) {
374
- return self.getPriorMean();
375
- });
376
-
377
- Class rb_cDMR = define_class_under<tomoto::IDMRModel, tomoto::ILDAModel>(rb_mTomoto, "DMR")
378
- .define_singleton_method(
379
- "_new",
380
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
381
- if (seed < 0) {
382
- seed = std::random_device{}();
383
- }
384
- return tomoto::IDMRModel::create((tomoto::TermWeight)tw, k, alpha, sigma, eta, alpha_epsilon, seed);
385
- })
386
- .define_method(
387
- "_add_doc",
388
- *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::string metadata) {
389
- auto doc = buildDoc(words);
390
- doc.misc["metadata"] = metadata;
391
- self.addDoc(doc);
392
- })
393
- .define_method(
394
- "alpha_epsilon",
395
- *[](tomoto::IDMRModel& self) {
396
- return self.getAlphaEps();
397
- })
398
- .define_method(
399
- "alpha_epsilon=",
400
- *[](tomoto::IDMRModel& self, tomoto::Float value) {
401
- self.setAlphaEps(value);
402
- return value;
403
- })
404
- .define_method(
405
- "f",
406
- *[](tomoto::IDMRModel& self) {
407
- return self.getF();
408
- })
409
- .define_method(
410
- "_lambdas",
411
- *[](tomoto::IDMRModel& self, tomoto::Tid topic_id) {
412
- return self.getLambdaByTopic(topic_id);
413
- })
414
- .define_method(
415
- "metadata_dict",
416
- *[](tomoto::IDMRModel& self) {
417
- auto dict = self.getMetadataDict();
418
- Array res;
419
- auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
420
- for (size_t i = 0; i < dict.size(); i++) {
421
- res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
422
- }
423
- return res;
424
- })
425
- .define_method(
426
- "sigma",
427
- *[](tomoto::IDMRModel& self) {
428
- return self.getSigma();
429
- });
430
-
431
- Class rb_cDT = define_class_under<tomoto::IDTModel, tomoto::ILDAModel>(rb_mTomoto, "DT")
432
- .define_singleton_method(
433
- "_new",
434
- *[](size_t tw, size_t k, size_t t, tomoto::Float alphaVar, tomoto::Float etaVar, tomoto::Float phiVar, tomoto::Float shapeA, tomoto::Float shapeB, tomoto::Float shapeC) {
435
- // Rice only supports 10 arguments
436
- int seed = -1;
437
- if (seed < 0) {
438
- seed = std::random_device{}();
439
- }
440
- return tomoto::IDTModel::create((tomoto::TermWeight)tw, k, t, alphaVar, etaVar, phiVar, shapeA, shapeB, shapeC, 0, seed);
441
- })
442
- .define_method(
443
- "_add_doc",
444
- *[](tomoto::IDTModel& self, std::vector<std::string> words, uint32_t timepoint) {
445
- auto doc = buildDoc(words);
446
- doc.misc["timepoint"] = timepoint;
447
- self.addDoc(doc);
448
- })
449
- .define_method(
450
- "lr_a",
451
- *[](tomoto::IDTModel& self) {
452
- return self.getShapeA();
453
- })
454
- .define_method(
455
- "lr_a=",
456
- *[](tomoto::IDTModel& self, tomoto::Float value) {
457
- self.setShapeA(value);
458
- return value;
459
- })
460
- .define_method(
461
- "lr_b",
462
- *[](tomoto::IDTModel& self) {
463
- return self.getShapeB();
464
- })
465
- .define_method(
466
- "lr_b=",
467
- *[](tomoto::IDTModel& self, tomoto::Float value) {
468
- self.setShapeB(value);
469
- return value;
470
- })
471
- .define_method(
472
- "lr_c",
473
- *[](tomoto::IDTModel& self) {
474
- return self.getShapeC();
475
- })
476
- .define_method(
477
- "lr_c=",
478
- *[](tomoto::IDTModel& self, tomoto::Float value) {
479
- self.setShapeC(value);
480
- return value;
481
- })
482
- .define_method(
483
- "num_docs_by_timepoint",
484
- *[](tomoto::IDTModel& self) {
485
- return self.getNumDocsByT();
486
- })
487
- .define_method(
488
- "num_timepoints",
489
- *[](tomoto::IDTModel& self) {
490
- return self.getT();
491
- });
492
-
493
- Class rb_cGDMR = define_class_under<tomoto::IGDMRModel, tomoto::IDMRModel>(rb_mTomoto, "GDMR")
494
- .define_singleton_method(
495
- "_new",
496
- *[](size_t tw, size_t k, std::vector<uint64_t> degrees, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float sigma0, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
497
- if (seed < 0) {
498
- seed = std::random_device{}();
499
- }
500
- return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
501
- })
502
- .define_method(
503
- "_add_doc",
504
- *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
505
- auto doc = buildDoc(words);
506
- doc.misc["metadata"] = metadata;
507
- self.addDoc(doc);
508
- })
509
- .define_method(
510
- "degrees",
511
- *[](tomoto::IGDMRModel& self) {
512
- return self.getFs();
513
- })
514
- .define_method(
515
- "sigma0",
516
- *[](tomoto::IGDMRModel& self) {
517
- return self.getSigma0();
518
- });
519
-
520
- Class rb_cHDP = define_class_under<tomoto::IHDPModel, tomoto::ILDAModel>(rb_mTomoto, "HDP")
521
- .define_singleton_method(
522
- "_new",
523
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
524
- if (seed < 0) {
525
- seed = std::random_device{}();
526
- }
527
- return tomoto::IHDPModel::create((tomoto::TermWeight)tw, k, alpha, eta, gamma, seed);
528
- })
529
- .define_method(
530
- "alpha",
531
- *[](tomoto::IHDPModel& self) {
532
- return self.getAlpha();
533
- })
534
- .define_method(
535
- "gamma",
536
- *[](tomoto::IHDPModel& self) {
537
- return self.getGamma();
538
- })
539
- .define_method(
540
- "live_k",
541
- *[](tomoto::IHDPModel& self) {
542
- return self.getLiveK();
543
- })
544
- .define_method(
545
- "live_topic?",
546
- *[](tomoto::IHDPModel& self, size_t tid) {
547
- return self.isLiveTopic(tid);
548
- })
549
- .define_method(
550
- "num_tables",
551
- *[](tomoto::IHDPModel& self) {
552
- return self.getTotalTables();
553
- });
554
-
555
- Class rb_cHLDA = define_class_under<tomoto::IHLDAModel, tomoto::ILDAModel>(rb_mTomoto, "HLDA")
556
- .define_singleton_method(
557
- "_new",
558
- *[](size_t tw, size_t levelDepth, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
559
- if (seed < 0) {
560
- seed = std::random_device{}();
561
- }
562
- return tomoto::IHLDAModel::create((tomoto::TermWeight)tw, levelDepth, alpha, eta, gamma, seed);
563
- })
564
- .define_method(
565
- "alpha",
566
- *[](tomoto::IHLDAModel& self) {
567
- Array res;
568
- for (size_t i = 0; i < self.getLevelDepth(); i++) {
569
- res.push(self.getAlpha(i));
570
- }
571
- return res;
572
- })
573
- .define_method(
574
- "_children_topics",
575
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
576
- return self.getChildTopicId(topic_id);
577
- })
578
- .define_method(
579
- "depth",
580
- *[](tomoto::IHLDAModel& self) {
581
- return self.getLevelDepth();
582
- })
583
- .define_method(
584
- "gamma",
585
- *[](tomoto::IHLDAModel& self) {
586
- return self.getGamma();
587
- })
588
- .define_method(
589
- "_level",
590
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
591
- return self.getLevelOfTopic(topic_id);
592
- })
593
- .define_method(
594
- "live_k",
595
- *[](tomoto::IHLDAModel& self) {
596
- return self.getLiveK();
597
- })
598
- .define_method(
599
- "_live_topic?",
600
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
601
- return self.isLiveTopic(topic_id);
602
- })
603
- .define_method(
604
- "_num_docs_of_topic",
605
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
606
- return self.getNumDocsOfTopic(topic_id);
607
- })
608
- .define_method(
609
- "_parent_topic",
610
- *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
611
- return self.getParentTopicId(topic_id);
612
- });
613
-
614
- Class rb_cPA = define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(rb_mTomoto, "PA")
615
- .define_singleton_method(
616
- "_new",
617
- *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
618
- if (seed < 0) {
619
- seed = std::random_device{}();
620
- }
621
- return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
622
- })
623
- .define_method(
624
- "k1",
625
- *[](tomoto::IPAModel& self) {
626
- return self.getK();
627
- })
628
- .define_method(
629
- "k2",
630
- *[](tomoto::IPAModel& self) {
631
- return self.getK2();
632
- });
633
-
634
- Class rb_cHPA = define_class_under<tomoto::IHPAModel, tomoto::IPAModel>(rb_mTomoto, "HPA")
635
- .define_singleton_method(
636
- "_new",
637
- *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
638
- if (seed < 0) {
639
- seed = std::random_device{}();
640
- }
641
- return tomoto::IHPAModel::create((tomoto::TermWeight)tw, false, k1, k2, alpha, eta, seed);
642
- })
643
- .define_method(
644
- "alpha",
645
- *[](tomoto::IHPAModel& self) {
646
- Array res;
647
- // use <= to return k+1 elements
648
- for (size_t i = 0; i <= self.getK(); i++) {
649
- res.push(self.getAlpha(i));
650
- }
651
- return res;
652
- });
653
-
654
- Class rb_cMGLDA = define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(rb_mTomoto, "MGLDA")
655
- .define_singleton_method(
656
- "_new",
657
- *[](size_t tw, size_t k_g, size_t k_l, size_t t, tomoto::Float alpha_g, tomoto::Float alpha_l, tomoto::Float alpha_mg, tomoto::Float alpha_ml, tomoto::Float eta_g) {
658
- return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
659
- })
660
- .define_method(
661
- "_add_doc",
662
- *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
663
- auto doc = buildDoc(words);
664
- doc.misc["delimiter"] = delimiter;
665
- self.addDoc(doc);
666
- })
667
- .define_method(
668
- "alpha_g",
669
- *[](tomoto::IMGLDAModel& self) {
670
- return self.getAlpha();
671
- })
672
- .define_method(
673
- "alpha_l",
674
- *[](tomoto::IMGLDAModel& self) {
675
- return self.getAlphaL();
676
- })
677
- .define_method(
678
- "alpha_mg",
679
- *[](tomoto::IMGLDAModel& self) {
680
- return self.getAlphaM();
681
- })
682
- .define_method(
683
- "alpha_ml",
684
- *[](tomoto::IMGLDAModel& self) {
685
- return self.getAlphaML();
686
- })
687
- .define_method(
688
- "eta_g",
689
- *[](tomoto::IMGLDAModel& self) {
690
- return self.getEta();
691
- })
692
- .define_method(
693
- "eta_l",
694
- *[](tomoto::IMGLDAModel& self) {
695
- return self.getEtaL();
696
- })
697
- .define_method(
698
- "gamma",
699
- *[](tomoto::IMGLDAModel& self) {
700
- return self.getGamma();
701
- })
702
- .define_method(
703
- "k_g",
704
- *[](tomoto::IMGLDAModel& self) {
705
- return self.getK();
706
- })
707
- .define_method(
708
- "k_l",
709
- *[](tomoto::IMGLDAModel& self) {
710
- return self.getKL();
711
- })
712
- .define_method(
713
- "t",
714
- *[](tomoto::IMGLDAModel& self) {
715
- return self.getT();
716
- });
717
-
718
- Class rb_cLLDA = define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(rb_mTomoto, "LLDA")
719
- .define_singleton_method(
720
- "_new",
721
- *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
722
- if (seed < 0) {
723
- seed = std::random_device{}();
724
- }
725
- return tomoto::ILLDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
726
- })
727
- .define_method(
728
- "_add_doc",
729
- *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
730
- auto doc = buildDoc(words);
731
- doc.misc["labels"] = labels;
732
- self.addDoc(doc);
733
- })
734
- .define_method(
735
- "topics_per_label",
736
- *[](tomoto::ILLDAModel& self) {
737
- return self.getNumTopicsPerLabel();
738
- });
739
-
740
- Class rb_cPLDA = define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(rb_mTomoto, "PLDA")
741
- .define_singleton_method(
742
- "_new",
743
- *[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, int seed) {
744
- if (seed < 0) {
745
- seed = std::random_device{}();
746
- }
747
- return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
748
- })
749
- .define_method(
750
- "_add_doc",
751
- *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
752
- auto doc = buildDoc(words);
753
- doc.misc["labels"] = labels;
754
- self.addDoc(doc);
755
- })
756
- .define_method(
757
- "latent_topics",
758
- *[](tomoto::IPLDAModel& self) {
759
- return self.getNumLatentTopics();
760
- });
761
-
762
- Class rb_cSLDA = define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(rb_mTomoto, "SLDA")
763
- .define_singleton_method(
764
- "_new",
765
- *[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, int seed) {
766
- if (seed < 0) {
767
- seed = std::random_device{}();
768
- }
769
- std::vector<tomoto::ISLDAModel::GLM> vars;
770
- vars.reserve(rb_vars.size());
771
- for (auto const& v : rb_vars) {
772
- vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
773
- }
774
- return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
775
- })
776
- .define_method(
777
- "_add_doc",
778
- *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
779
- auto doc = buildDoc(words);
780
- doc.misc["y"] = y;
781
- self.addDoc(doc);
782
- })
783
- .define_method(
784
- "f",
785
- *[](tomoto::ISLDAModel& self) {
786
- return self.getF();
787
- })
788
- .define_method(
789
- "_var_type",
790
- *[](tomoto::ISLDAModel& self, size_t var_id) {
791
- if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
792
- return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
793
- });
35
+ init_lda(m);
36
+ init_ct(m);
37
+ init_dmr(m);
38
+ init_dt(m);
39
+ init_gdmr(m);
40
+ init_hdp(m);
41
+ init_hlda(m);
42
+ init_pa(m);
43
+ init_hpa(m);
44
+ init_mglda(m);
45
+ init_llda(m);
46
+ init_plda(m);
47
+ init_slda(m);
794
48
  }