tomoto 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -0
  5. data/ext/tomoto/ct.cpp +54 -0
  6. data/ext/tomoto/dmr.cpp +62 -0
  7. data/ext/tomoto/dt.cpp +82 -0
  8. data/ext/tomoto/ext.cpp +27 -773
  9. data/ext/tomoto/gdmr.cpp +34 -0
  10. data/ext/tomoto/hdp.cpp +42 -0
  11. data/ext/tomoto/hlda.cpp +66 -0
  12. data/ext/tomoto/hpa.cpp +27 -0
  13. data/ext/tomoto/lda.cpp +250 -0
  14. data/ext/tomoto/llda.cpp +29 -0
  15. data/ext/tomoto/mglda.cpp +71 -0
  16. data/ext/tomoto/pa.cpp +27 -0
  17. data/ext/tomoto/plda.cpp +29 -0
  18. data/ext/tomoto/slda.cpp +40 -0
  19. data/ext/tomoto/utils.h +84 -0
  20. data/lib/tomoto/tomoto.bundle +0 -0
  21. data/lib/tomoto/tomoto.so +0 -0
  22. data/lib/tomoto/version.rb +1 -1
  23. data/vendor/tomotopy/README.kr.rst +12 -3
  24. data/vendor/tomotopy/README.rst +12 -3
  25. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +47 -2
  26. data/vendor/tomotopy/src/Labeling/FoRelevance.h +21 -151
  27. data/vendor/tomotopy/src/Labeling/Labeler.h +5 -3
  28. data/vendor/tomotopy/src/Labeling/Phraser.hpp +518 -0
  29. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +6 -3
  30. data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
  31. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +8 -23
  32. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +9 -18
  33. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +56 -58
  34. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +4 -14
  35. data/vendor/tomotopy/src/TopicModel/LDA.h +69 -17
  36. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +1 -1
  37. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +108 -61
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -8
  39. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +26 -16
  40. data/vendor/tomotopy/src/TopicModel/PT.h +27 -0
  41. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +10 -0
  42. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +273 -0
  43. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +16 -11
  44. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +3 -2
  45. data/vendor/tomotopy/src/Utils/Trie.hpp +39 -8
  46. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +36 -38
  47. data/vendor/tomotopy/src/Utils/Utils.hpp +50 -45
  48. data/vendor/tomotopy/src/Utils/math.h +8 -4
  49. data/vendor/tomotopy/src/Utils/tvector.hpp +4 -0
  50. metadata +24 -60
@@ -0,0 +1,34 @@
1
+ #include <GDMR.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_gdmr(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IGDMRModel, tomoto::IDMRModel>(m, "GDMR")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, std::vector<uint64_t> degrees, tomoto::Float alpha, tomoto::Float sigma, tomoto::Float sigma0, tomoto::Float eta, tomoto::Float alpha_epsilon, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
16
+ })
17
+ .define_method(
18
+ "_add_doc",
19
+ *[](tomoto::IGDMRModel& self, std::vector<std::string> words, std::vector<tomoto::Float> metadata) {
20
+ auto doc = buildDoc(words);
21
+ doc.misc["metadata"] = metadata;
22
+ return self.addDoc(doc);
23
+ })
24
+ .define_method(
25
+ "degrees",
26
+ *[](tomoto::IGDMRModel& self) {
27
+ return self.getFs();
28
+ })
29
+ .define_method(
30
+ "sigma0",
31
+ *[](tomoto::IGDMRModel& self) {
32
+ return self.getSigma0();
33
+ });
34
+ }
@@ -0,0 +1,42 @@
1
+ #include <HDP.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_hdp(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IHDPModel, tomoto::ILDAModel>(m, "HDP")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IHDPModel::create((tomoto::TermWeight)tw, k, alpha, eta, gamma, seed);
16
+ })
17
+ .define_method(
18
+ "alpha",
19
+ *[](tomoto::IHDPModel& self) {
20
+ return self.getAlpha();
21
+ })
22
+ .define_method(
23
+ "gamma",
24
+ *[](tomoto::IHDPModel& self) {
25
+ return self.getGamma();
26
+ })
27
+ .define_method(
28
+ "live_k",
29
+ *[](tomoto::IHDPModel& self) {
30
+ return self.getLiveK();
31
+ })
32
+ .define_method(
33
+ "live_topic?",
34
+ *[](tomoto::IHDPModel& self, size_t tid) {
35
+ return self.isLiveTopic(tid);
36
+ })
37
+ .define_method(
38
+ "num_tables",
39
+ *[](tomoto::IHDPModel& self) {
40
+ return self.getTotalTables();
41
+ });
42
+ }
@@ -0,0 +1,66 @@
1
+ #include <HLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_hlda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IHLDAModel, tomoto::ILDAModel>(m, "HLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t levelDepth, tomoto::Float alpha, tomoto::Float eta, tomoto::Float gamma, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IHLDAModel::create((tomoto::TermWeight)tw, levelDepth, alpha, eta, gamma, seed);
16
+ })
17
+ .define_method(
18
+ "alpha",
19
+ *[](tomoto::IHLDAModel& self) {
20
+ Array res;
21
+ for (size_t i = 0; i < self.getLevelDepth(); i++) {
22
+ res.push(self.getAlpha(i));
23
+ }
24
+ return res;
25
+ })
26
+ .define_method(
27
+ "_children_topics",
28
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
29
+ return self.getChildTopicId(topic_id);
30
+ })
31
+ .define_method(
32
+ "depth",
33
+ *[](tomoto::IHLDAModel& self) {
34
+ return self.getLevelDepth();
35
+ })
36
+ .define_method(
37
+ "gamma",
38
+ *[](tomoto::IHLDAModel& self) {
39
+ return self.getGamma();
40
+ })
41
+ .define_method(
42
+ "_level",
43
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
44
+ return self.getLevelOfTopic(topic_id);
45
+ })
46
+ .define_method(
47
+ "live_k",
48
+ *[](tomoto::IHLDAModel& self) {
49
+ return self.getLiveK();
50
+ })
51
+ .define_method(
52
+ "_live_topic?",
53
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
54
+ return self.isLiveTopic(topic_id);
55
+ })
56
+ .define_method(
57
+ "_num_docs_of_topic",
58
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
59
+ return self.getNumDocsOfTopic(topic_id);
60
+ })
61
+ .define_method(
62
+ "_parent_topic",
63
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
64
+ return self.getParentTopicId(topic_id);
65
+ });
66
+ }
@@ -0,0 +1,27 @@
1
+ #include <HPA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_hpa(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::IHPAModel, tomoto::IPAModel>(m, "HPA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k1, size_t k2, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::IHPAModel::create((tomoto::TermWeight)tw, false, k1, k2, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "alpha",
19
+ *[](tomoto::IHPAModel& self) {
20
+ Array res;
21
+ // use <= to return k+1 elements
22
+ for (size_t i = 0; i <= self.getK(); i++) {
23
+ res.push(self.getAlpha(i));
24
+ }
25
+ return res;
26
+ });
27
+ }
@@ -0,0 +1,250 @@
1
+ #include <fstream>
2
+ #include <iostream>
3
+
4
+ #include <LDA.h>
5
+
6
+ #include <rice/Class.hpp>
7
+ #include <rice/Hash.hpp>
8
+ #include <rice/Module.hpp>
9
+
10
+ #include "utils.h"
11
+
12
+ class DocumentObject
13
+ {
14
+ public:
15
+ DocumentObject(const tomoto::DocumentBase* _doc, const tomoto::ITopicModel* _tm) : doc{ _doc }, tm{ _tm } {}
16
+
17
+ const tomoto::DocumentBase* doc;
18
+ const tomoto::ITopicModel* tm;
19
+ };
20
+
21
+ void init_lda(Rice::Module& m) {
22
+ Rice::define_class_under<DocumentObject>(m, "Document")
23
+ .define_method(
24
+ "topics",
25
+ *[](DocumentObject& self) {
26
+ Rice::Hash res;
27
+ auto topics = self.tm->getTopicsByDoc(self.doc);
28
+ for (size_t i = 0; i < topics.size(); i++) {
29
+ res[i] = topics[i];
30
+ }
31
+ return res;
32
+ });
33
+
34
+ Rice::define_class_under<tomoto::ILDAModel>(m, "LDA")
35
+ .define_singleton_method(
36
+ "_new",
37
+ *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
38
+ if (seed < 0) {
39
+ seed = std::random_device{}();
40
+ }
41
+ return tomoto::ILDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
42
+ })
43
+ .define_method(
44
+ "_add_doc",
45
+ *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
46
+ return self.addDoc(buildDoc(words));
47
+ })
48
+ .define_method(
49
+ "alpha",
50
+ *[](tomoto::ILDAModel& self) {
51
+ Array res;
52
+ for (size_t i = 0; i < self.getK(); i++) {
53
+ res.push(self.getAlpha(i));
54
+ }
55
+ return res;
56
+ })
57
+ .define_method(
58
+ "burn_in",
59
+ *[](tomoto::ILDAModel& self) {
60
+ return self.getBurnInIteration();
61
+ })
62
+ .define_method(
63
+ "burn_in=",
64
+ *[](tomoto::ILDAModel& self, size_t iteration) {
65
+ self.setBurnInIteration(iteration);
66
+ return iteration;
67
+ })
68
+ .define_method(
69
+ "_count_by_topics",
70
+ *[](tomoto::ILDAModel& self) {
71
+ Array res;
72
+ for (auto const& v : self.getCountByTopic()) {
73
+ res.push(v);
74
+ }
75
+ return res;
76
+ })
77
+ .define_method(
78
+ "docs",
79
+ *[](tomoto::ILDAModel& self) {
80
+ Array res;
81
+ auto n = self.getNumDocs();
82
+ for (size_t i = 0; i < n; i++) {
83
+ res.push(DocumentObject(self.getDoc(i), &self));
84
+ }
85
+ return res;
86
+ })
87
+ .define_method(
88
+ "eta",
89
+ *[](tomoto::ILDAModel& self) {
90
+ return self.getEta();
91
+ })
92
+ .define_method(
93
+ "global_step",
94
+ *[](tomoto::ILDAModel& self) {
95
+ return self.getGlobalStep();
96
+ })
97
+ .define_method(
98
+ "k",
99
+ *[](tomoto::ILDAModel& self) {
100
+ return self.getK();
101
+ })
102
+ .define_method(
103
+ "_load",
104
+ *[](tomoto::ILDAModel& self, const char* filename) {
105
+ std::ifstream str{ filename, std::ios_base::binary };
106
+ if (!str) throw std::runtime_error{ std::string("cannot open file '") + filename + std::string("'") };
107
+ std::vector<uint8_t> extra_data;
108
+ self.loadModel(str, &extra_data);
109
+ })
110
+ .define_method(
111
+ "ll_per_word",
112
+ *[](tomoto::ILDAModel& self) {
113
+ return self.getLLPerWord();
114
+ })
115
+ .define_method(
116
+ "num_docs",
117
+ *[](tomoto::ILDAModel& self) {
118
+ return self.getNumDocs();
119
+ })
120
+ .define_method(
121
+ "num_vocabs",
122
+ *[](tomoto::ILDAModel& self) {
123
+ return self.getV();
124
+ })
125
+ .define_method(
126
+ "num_words",
127
+ *[](tomoto::ILDAModel& self) {
128
+ return self.getN();
129
+ })
130
+ .define_method(
131
+ "optim_interval",
132
+ *[](tomoto::ILDAModel& self) {
133
+ return self.getOptimInterval();
134
+ })
135
+ .define_method(
136
+ "optim_interval=",
137
+ *[](tomoto::ILDAModel& self, size_t value) {
138
+ self.setOptimInterval(value);
139
+ return value;
140
+ })
141
+ .define_method(
142
+ "perplexity",
143
+ *[](tomoto::ILDAModel& self) {
144
+ return self.getPerplexity();
145
+ })
146
+ .define_method(
147
+ "_prepare",
148
+ *[](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
149
+ self.prepare(true, minCnt, minDf, rmTop);
150
+ })
151
+ .define_method(
152
+ "_removed_top_words",
153
+ *[](tomoto::ILDAModel& self, size_t rmTop) {
154
+ Array res;
155
+ auto dict = self.getVocabDict();
156
+ size_t size = dict.size();
157
+ for (size_t i = rmTop; i > 0; i--) {
158
+ res.push(dict.toWord(size - i));
159
+ }
160
+ return res;
161
+ })
162
+ .define_method(
163
+ "_save",
164
+ *[](tomoto::ILDAModel& self, const char* filename, bool full) {
165
+ std::ofstream str{ filename, std::ios_base::binary };
166
+ std::vector<uint8_t> extra_data;
167
+ self.saveModel(str, full, &extra_data);
168
+ })
169
+ .define_method(
170
+ "_topic_words",
171
+ *[](tomoto::ILDAModel& self, size_t topicId, size_t topN) {
172
+ Rice::Hash res;
173
+ for (auto const& v : self.getWordsByTopicSorted(topicId, topN)) {
174
+ res[v.first] = v.second;
175
+ }
176
+ return res;
177
+ })
178
+ .define_method(
179
+ "_train",
180
+ *[](tomoto::ILDAModel& self, size_t iteration, size_t workers, size_t ps) {
181
+ self.train(iteration, workers, (tomoto::ParallelScheme)ps);
182
+ })
183
+ .define_method(
184
+ "_tw",
185
+ *[](tomoto::ILDAModel& self) {
186
+ return (int)self.getTermWeight();
187
+ })
188
+ .define_method(
189
+ "used_vocab_df",
190
+ *[](tomoto::ILDAModel& self) {
191
+ auto vocab = self.getVocabDf();
192
+ Array res;
193
+ for (size_t i = 0; i < self.getV(); i++) {
194
+ res.push(vocab[i]);
195
+ }
196
+ return res;
197
+ })
198
+ .define_method(
199
+ "used_vocab_freq",
200
+ *[](tomoto::ILDAModel& self) {
201
+ auto vocab = self.getVocabCf();
202
+ Array res;
203
+ for (size_t i = 0; i < self.getV(); i++) {
204
+ res.push(vocab[i]);
205
+ }
206
+ return res;
207
+ })
208
+ .define_method(
209
+ "used_vocabs",
210
+ *[](tomoto::ILDAModel& self) {
211
+ auto dict = self.getVocabDict();
212
+ Array res;
213
+ auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
214
+ for (size_t i = 0; i < self.getV(); i++) {
215
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
216
+ }
217
+ return res;
218
+ })
219
+ .define_method(
220
+ "vocab_df",
221
+ *[](tomoto::ILDAModel& self) {
222
+ auto vocab = self.getVocabDf();
223
+ Array res;
224
+ for (size_t i = 0; i < vocab.size(); i++) {
225
+ res.push(vocab[i]);
226
+ }
227
+ return res;
228
+ })
229
+ .define_method(
230
+ "vocab_freq",
231
+ *[](tomoto::ILDAModel& self) {
232
+ auto vocab = self.getVocabCf();
233
+ Array res;
234
+ for (size_t i = 0; i < vocab.size(); i++) {
235
+ res.push(vocab[i]);
236
+ }
237
+ return res;
238
+ })
239
+ .define_method(
240
+ "vocabs",
241
+ *[](tomoto::ILDAModel& self) {
242
+ auto dict = self.getVocabDict();
243
+ Array res;
244
+ auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
245
+ for (size_t i = 0; i < dict.size(); i++) {
246
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
247
+ }
248
+ return res;
249
+ });
250
+ }
@@ -0,0 +1,29 @@
1
+ #include <LLDA.h>
2
+
3
+ #include <rice/Module.hpp>
4
+
5
+ #include "utils.h"
6
+
7
+ void init_llda(Rice::Module& m) {
8
+ Rice::define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(m, "LLDA")
9
+ .define_singleton_method(
10
+ "_new",
11
+ *[](size_t tw, size_t k, tomoto::Float alpha, tomoto::Float eta, int seed) {
12
+ if (seed < 0) {
13
+ seed = std::random_device{}();
14
+ }
15
+ return tomoto::ILLDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
16
+ })
17
+ .define_method(
18
+ "_add_doc",
19
+ *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
20
+ auto doc = buildDoc(words);
21
+ doc.misc["labels"] = labels;
22
+ return self.addDoc(doc);
23
+ })
24
+ .define_method(
25
+ "topics_per_label",
26
+ *[](tomoto::ILLDAModel& self) {
27
+ return self.getNumTopicsPerLabel();
28
+ });
29
+ }