tomoto 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a27c5c0ff4a71b0e0b084773adf7a2a0ede39152b210973787c12e98637cb7d3
4
- data.tar.gz: c70cabfa9e8685e86edae56c4b50c52a8bb6baf4d3f7684c3e28d7345e460551
3
+ metadata.gz: 3b40c9adf2f0162eb6174b17395ea37b9294e14b22609e9f51951e9904125ff9
4
+ data.tar.gz: be3f68438f60a7e4fc11033921636f8d03bf411bd3d3eb6aa3b4fb448faac41a
5
5
  SHA512:
6
- metadata.gz: d23a02abb149799facf1b557004ebc2d749131d37eb33e8f70e8aa109f117cc79a3db847326206fb08d5114b983959fa782082c9fe239e3573afe363d81f5066
7
- data.tar.gz: 6ca2548b92c30adea217437dfad8a5e0ef802c2789a21cd5d40d88514607889b854afa6a2b7e1ef06f7ecabaa1226655c0dfc29aaa5439993861c2895542ae98
6
+ metadata.gz: a74747ae372d030c42562d4e2b99ab167ccc28533468ed08819f4bd34d42b340349870712c12e565388eb7833f993349432e77baee8618c34c265676ca181072
7
+ data.tar.gz: da9e833bb98726278108a68a7dd6bed0e54b3979c25d49d8db01aa613e6205a6a3512881688209baf2589c4dc5514ed2663fb251a5e273c42b678bb1daa06d74
@@ -1,3 +1,7 @@
1
+ ## 0.1.1 (2020-10-10)
2
+
3
+ - Added many more models (LLDA, PLDA, SLDA, DMR, GDMR, HLDA, MGLDA, PA, HPA, DT)
4
+
1
5
  ## 0.1.0 (2020-10-09)
2
6
 
3
7
  - First release
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
- # Tomoto
1
+ # tomoto
2
2
 
3
- [Tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
3
+ :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
+
5
+ [![Build Status](https://travis-ci.org/ankane/tomoto.svg?branch=master)](https://travis-ci.org/ankane/tomoto)
4
6
 
5
7
  ## Installation
6
8
 
@@ -10,7 +12,7 @@ Add this line to your application’s Gemfile:
10
12
  gem 'tomoto'
11
13
  ```
12
14
 
13
- It can take around 10 minutes to compile the extension.
15
+ It can take 10-20 minutes to compile the extension.
14
16
 
15
17
  ## Getting Started
16
18
 
@@ -65,23 +67,27 @@ model.ll_per_word
65
67
  Supports:
66
68
 
67
69
  - Latent Dirichlet Allocation (`LDA`)
70
+ - Labeled LDA (`LLDA`)
71
+ - Partially Labeled LDA (`PLDA`)
72
+ - Supervised LDA (`SLDA`)
73
+ - Dirichlet Multinomial Regression (`DMR`)
74
+ - Generalized Dirichlet Multinomial Regression (`GDMR`)
68
75
  - Hierarchical Dirichlet Process (`HDP`)
76
+ - Hierarchical LDA (`HLDA`)
77
+ - Multi Grain LDA (`MGLDA`)
78
+ - Pachinko Allocation (`PA`)
79
+ - Hierarchical PA (`HPA`)
69
80
  - Correlated Topic Model (`CT`)
81
+ - Dynamic Topic Model (`DT`)
70
82
 
71
- ## Parameters
83
+ ## API
72
84
 
73
- ```ruby
74
- Tomoto::LDA.new(
75
- tw: :one, # or :idf, :pmi
76
- min_cf: 0,
77
- min_df: 0,
78
- rm_top: 0,
79
- k: 1,
80
- alpha: 0.1,
81
- eta: 0.01,
82
- seed: nil
83
- )
84
- ```
85
+ This library follows the [tomotopy API](https://bab2min.github.io/tomotopy/v0.9.0/en/). There are a few changes to make it more Ruby-like:
86
+
87
+ - The `get_` prefix has been removed from methods (`topic_words` instead of `get_topic_words`)
88
+ - Methods that return booleans use `?` instead of `is_` (`live_topic?` instead of `is_live_topic`)
89
+
90
+ If a method or option you need isn’t supported, feel free to open an issue.
85
91
 
86
92
  ## Tokenization
87
93
 
@@ -93,7 +99,7 @@ model.add_doc(["tokens", "from", "document", "one"])
93
99
 
94
100
  ## Performance
95
101
 
96
- Tomoto uses AVX2, AVX, or SSE2 instructions to increase performance on machines that support it. Check what it’s using with:
102
+ tomoto uses AVX2, AVX, or SSE2 instructions to increase performance on machines that support it. Check what it’s using with:
97
103
 
98
104
  ```ruby
99
105
  Tomoto.isa
@@ -1,7 +1,21 @@
1
+ // stdlib
2
+ #include <fstream>
3
+ #include <iostream>
4
+
1
5
  // tomoto
2
6
  #include <CT.h>
7
+ #include <DMR.h>
8
+ #include <DT.h>
9
+ #include <GDMR.h>
3
10
  #include <HDP.h>
11
+ #include <HLDA.h>
12
+ #include <HPA.h>
4
13
  #include <LDA.h>
14
+ #include <LLDA.h>
15
+ #include <MGLDA.h>
16
+ #include <PA.h>
17
+ #include <PLDA.h>
18
+ #include <SLDA.h>
5
19
 
6
20
  // rice
7
21
  #include <rice/Array.hpp>
@@ -26,6 +40,62 @@ Object to_ruby<std::vector<float>>(std::vector<float> const & x)
26
40
  return res;
27
41
  }
28
42
 
43
+ template<> // Rice conversion: std::vector<uint32_t> -> Ruby Array
44
+ Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
45
+ {
46
+ Array out;
47
+ for (const auto& item : x) { // elements are appended in their original order
48
+ out.push(item);
49
+ }
50
+ return out;
51
+ }
52
+
53
+ template<> // Rice conversion: std::vector<uint64_t> -> Ruby Array
54
+ Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
55
+ {
56
+ Array out;
57
+ for (const auto& item : x) { // elements are appended in their original order
58
+ out.push(item);
59
+ }
60
+ return out;
61
+ }
62
+
63
+ template<> // Rice conversion: Ruby Array -> std::vector<std::string>
64
+ std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
65
+ {
66
+ Array items(x); // view the incoming object as a Ruby Array
67
+ std::vector<std::string> out;
68
+ out.reserve(items.size()); // one allocation for the whole result
69
+ for (const auto& item : items) {
70
+ out.push_back(from_ruby<std::string>(item)); // each element is converted individually
71
+ }
72
+ return out;
73
+ }
74
+
75
+ template<> // Rice conversion: Ruby Array -> std::vector<float>
76
+ std::vector<float> from_ruby<std::vector<float>>(Object x)
77
+ {
78
+ Array items(x); // view the incoming object as a Ruby Array
79
+ std::vector<float> out;
80
+ out.reserve(items.size()); // one allocation for the whole result
81
+ for (const auto& item : items) {
82
+ out.push_back(from_ruby<float>(item)); // each element is converted individually
83
+ }
84
+ return out;
85
+ }
86
+
87
+ template<> // Rice conversion: Ruby Array -> std::vector<uint64_t>
88
+ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
89
+ {
90
+ Array items(x); // view the incoming object as a Ruby Array
91
+ std::vector<uint64_t> out;
92
+ out.reserve(items.size()); // one allocation for the whole result
93
+ for (const auto& item : items) {
94
+ out.push_back(from_ruby<uint64_t>(item)); // each element is converted individually
95
+ }
96
+ return out;
97
+ }
98
+
29
99
  extern "C"
30
100
  void Init_ext()
31
101
  {
@@ -55,12 +125,7 @@ void Init_ext()
55
125
  })
56
126
  .define_method(
57
127
  "_add_doc",
58
- *[](tomoto::ILDAModel& self, Array rb_words) {
59
- std::vector<std::string> words;
60
- words.reserve(rb_words.size());
61
- for (auto const& v : rb_words) {
62
- words.push_back(from_ruby<std::string>(v));
63
- }
128
+ *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
64
129
  self.addDoc(words);
65
130
  })
66
131
  .define_method(
@@ -93,6 +158,11 @@ void Init_ext()
93
158
  *[](tomoto::ILDAModel& self) {
94
159
  return self.getEta();
95
160
  })
161
+ .define_method(
162
+ "global_step",
163
+ *[](tomoto::ILDAModel& self) {
164
+ return self.getGlobalStep();
165
+ })
96
166
  .define_method(
97
167
  "k",
98
168
  *[](tomoto::ILDAModel& self) {
@@ -112,15 +182,36 @@ void Init_ext()
112
182
  return self.getLLPerWord();
113
183
  })
114
184
  .define_method(
115
- "num_words",
185
+ "num_docs",
116
186
  *[](tomoto::ILDAModel& self) {
117
- return self.getN();
187
+ return self.getNumDocs();
118
188
  })
119
189
  .define_method(
120
190
  "num_vocabs",
121
191
  *[](tomoto::ILDAModel& self) {
122
192
  return self.getV();
123
193
  })
194
+ .define_method(
195
+ "num_words",
196
+ *[](tomoto::ILDAModel& self) {
197
+ return self.getN();
198
+ })
199
+ .define_method(
200
+ "optim_interval",
201
+ *[](tomoto::ILDAModel& self) {
202
+ return self.getOptimInterval();
203
+ })
204
+ .define_method(
205
+ "optim_interval=",
206
+ *[](tomoto::ILDAModel& self, size_t value) {
207
+ self.setOptimInterval(value);
208
+ return value;
209
+ })
210
+ .define_method(
211
+ "perplexity",
212
+ *[](tomoto::ILDAModel& self) {
213
+ return self.getPerplexity();
214
+ })
124
215
  .define_method(
125
216
  "_prepare",
126
217
  *[](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
@@ -159,6 +250,62 @@ void Init_ext()
159
250
  size_t ps = 0;
160
251
  self.train(iteration, workers, (tomoto::ParallelScheme)ps);
161
252
  })
253
+ .define_method(
254
+ "_tw",
255
+ *[](tomoto::ILDAModel& self) {
256
+ return (int)self.getTermWeight();
257
+ })
258
+ .define_method(
259
+ "used_vocab_df",
260
+ *[](tomoto::ILDAModel& self) {
261
+ auto vocab = self.getVocabDf();
262
+ Array res;
263
+ for (size_t i = 0; i < self.getV(); i++) {
264
+ res.push(vocab[i]);
265
+ }
266
+ return res;
267
+ })
268
+ .define_method(
269
+ "used_vocab_freq",
270
+ *[](tomoto::ILDAModel& self) {
271
+ auto vocab = self.getVocabCf();
272
+ Array res;
273
+ for (size_t i = 0; i < self.getV(); i++) {
274
+ res.push(vocab[i]);
275
+ }
276
+ return res;
277
+ })
278
+ .define_method(
279
+ "used_vocabs", // returns one UTF-8 tagged Ruby string per dictionary index below getV()
280
+ *[](tomoto::ILDAModel& self) {
281
+ auto const& dict = self.getVocabDict(); // bind by const reference: the dictionary is only read, so do not copy it
282
+ Array res;
283
+ auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8"); // Encoding::UTF_8, looked up once and reused
284
+ for (size_t i = 0, n = self.getV(); i < n; i++) { // getV() is loop-invariant, so it is evaluated once
285
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
286
+ }
287
+ return res;
288
+ })
289
+ .define_method(
290
+ "vocab_df",
291
+ *[](tomoto::ILDAModel& self) {
292
+ auto vocab = self.getVocabDf();
293
+ Array res;
294
+ for (size_t i = 0; i < vocab.size(); i++) {
295
+ res.push(vocab[i]);
296
+ }
297
+ return res;
298
+ })
299
+ .define_method(
300
+ "vocab_freq",
301
+ *[](tomoto::ILDAModel& self) {
302
+ auto vocab = self.getVocabCf();
303
+ Array res;
304
+ for (size_t i = 0; i < vocab.size(); i++) {
305
+ res.push(vocab[i]);
306
+ }
307
+ return res;
308
+ })
162
309
  .define_method(
163
310
  "vocabs",
164
311
  *[](tomoto::ILDAModel& self) {
@@ -180,6 +327,11 @@ void Init_ext()
180
327
  }
181
328
  return tomoto::ICTModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
182
329
  })
330
+ .define_method(
331
+ "_correlations",
332
+ *[](tomoto::ICTModel& self, tomoto::Tid topic_id) {
333
+ return self.getCorrelationTopic(topic_id);
334
+ })
183
335
  .define_method(
184
336
  "num_beta_sample",
185
337
  *[](tomoto::ICTModel& self) {
@@ -187,9 +339,9 @@ void Init_ext()
187
339
  })
188
340
  .define_method(
189
341
  "num_beta_sample=",
190
- *[](tomoto::ICTModel& self, size_t numSample) {
191
- self.setNumBetaSample(numSample);
192
- return numSample;
342
+ *[](tomoto::ICTModel& self, size_t value) {
343
+ self.setNumBetaSample(value);
344
+ return value;
193
345
  })
194
346
  .define_method(
195
347
  "num_tmn_sample",
@@ -198,12 +350,12 @@ void Init_ext()
198
350
  })
199
351
  .define_method(
200
352
  "num_tmn_sample=",
201
- *[](tomoto::ICTModel& self, size_t numSample) {
202
- self.setNumTMNSample(numSample);
203
- return numSample;
353
+ *[](tomoto::ICTModel& self, size_t value) {
354
+ self.setNumTMNSample(value);
355
+ return value;
204
356
  })
205
357
  .define_method(
206
- "prior_cov",
358
+ "_prior_cov",
207
359
  *[](tomoto::ICTModel& self) {
208
360
  return self.getPriorCov();
209
361
  })
@@ -213,6 +365,138 @@ void Init_ext()
213
365
  return self.getPriorMean();
214
366
  });
215
367
 
368
+ Class rb_cDMR = define_class_under<tomoto::IDMRModel, tomoto::ILDAModel>(rb_mTomoto, "DMR")
369
+ .define_singleton_method(
370
+ "_new",
371
+ *[](size_t tw, size_t k, float alpha, float sigma, float eta, float alpha_epsilon, int seed) {
372
+ if (seed < 0) {
373
+ seed = std::random_device{}();
374
+ }
375
+ return tomoto::IDMRModel::create((tomoto::TermWeight)tw, k, alpha, sigma, eta, alpha_epsilon, seed);
376
+ })
377
+ .define_method(
378
+ "_add_doc",
379
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<std::string> metadata) {
380
+ self.addDoc(words, metadata);
381
+ })
382
+ .define_method(
383
+ "alpha_epsilon",
384
+ *[](tomoto::IDMRModel& self) {
385
+ return self.getAlphaEps();
386
+ })
387
+ .define_method(
388
+ "alpha_epsilon=",
389
+ *[](tomoto::IDMRModel& self, float value) {
390
+ self.setAlphaEps(value);
391
+ return value;
392
+ })
393
+ .define_method(
394
+ "f",
395
+ *[](tomoto::IDMRModel& self) {
396
+ return self.getF();
397
+ })
398
+ .define_method(
399
+ "_lambdas",
400
+ *[](tomoto::IDMRModel& self, tomoto::Tid topic_id) {
401
+ return self.getLambdaByTopic(topic_id);
402
+ })
403
+ .define_method(
404
+ "metadata_dict", // returns every entry of the metadata dictionary as a UTF-8 tagged Ruby string
405
+ *[](tomoto::IDMRModel& self) {
406
+ auto const& dict = self.getMetadataDict(); // bind by const reference: the dictionary is only read, so do not copy it
407
+ Array res;
408
+ auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8"); // Encoding::UTF_8, looked up once and reused
409
+ for (size_t i = 0; i < dict.size(); i++) {
410
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
411
+ }
412
+ return res;
413
+ })
414
+ .define_method(
415
+ "sigma",
416
+ *[](tomoto::IDMRModel& self) {
417
+ return self.getSigma();
418
+ });
419
+
420
+ Class rb_cDT = define_class_under<tomoto::IDTModel, tomoto::ILDAModel>(rb_mTomoto, "DT")
421
+ .define_singleton_method(
422
+ "_new",
423
+ *[](size_t tw, size_t k, size_t t, float alphaVar, float etaVar, float phiVar, float shapeA, float shapeB, float shapeC) {
424
+ // Rice only supports 10 arguments
425
+ int seed = -1;
426
+ if (seed < 0) {
427
+ seed = std::random_device{}();
428
+ }
429
+ return tomoto::IDTModel::create((tomoto::TermWeight)tw, k, t, alphaVar, etaVar, phiVar, shapeA, shapeB, shapeC, 0, seed);
430
+ })
431
+ .define_method(
432
+ "_add_doc",
433
+ *[](tomoto::IDTModel& self, std::vector<std::string> words, size_t timepoint) {
434
+ self.addDoc(words, timepoint);
435
+ })
436
+ .define_method(
437
+ "lr_a",
438
+ *[](tomoto::IDTModel& self) {
439
+ return self.getShapeA();
440
+ })
441
+ .define_method(
442
+ "lr_a=",
443
+ *[](tomoto::IDTModel& self, float value) {
444
+ self.setShapeA(value);
445
+ return value;
446
+ })
447
+ .define_method(
448
+ "lr_b",
449
+ *[](tomoto::IDTModel& self) {
450
+ return self.getShapeB();
451
+ })
452
+ .define_method(
453
+ "lr_b=",
454
+ *[](tomoto::IDTModel& self, float value) {
455
+ self.setShapeB(value);
456
+ return value;
457
+ })
458
+ .define_method(
459
+ "lr_c",
460
+ *[](tomoto::IDTModel& self) {
461
+ return self.getShapeC();
462
+ })
463
+ .define_method(
464
+ "lr_c=",
465
+ *[](tomoto::IDTModel& self, float value) {
466
+ self.setShapeC(value);
467
+ return value;
468
+ })
469
+ .define_method(
470
+ "num_docs_by_timepoint",
471
+ *[](tomoto::IDTModel& self) {
472
+ return self.getNumDocsByT();
473
+ })
474
+ .define_method(
475
+ "num_timepoints",
476
+ *[](tomoto::IDTModel& self) {
477
+ return self.getT();
478
+ });
479
+
480
+ Class rb_cGDMR = define_class_under<tomoto::IGDMRModel, tomoto::IDMRModel>(rb_mTomoto, "GDMR")
481
+ .define_singleton_method(
482
+ "_new",
483
+ *[](size_t tw, size_t k, std::vector<uint64_t> degrees, float alpha, float sigma, float sigma0, float eta, float alpha_epsilon, int seed) {
484
+ if (seed < 0) {
485
+ seed = std::random_device{}();
486
+ }
487
+ return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
488
+ })
489
+ .define_method(
490
+ "degrees",
491
+ *[](tomoto::IGDMRModel& self) {
492
+ return self.getFs();
493
+ })
494
+ .define_method(
495
+ "sigma0",
496
+ *[](tomoto::IGDMRModel& self) {
497
+ return self.getSigma0();
498
+ });
499
+
216
500
  Class rb_cHDP = define_class_under<tomoto::IHDPModel, tomoto::ILDAModel>(rb_mTomoto, "HDP")
217
501
  .define_singleton_method(
218
502
  "_new",
@@ -242,4 +526,217 @@ void Init_ext()
242
526
  *[](tomoto::IHDPModel& self) {
243
527
  return self.getTotalTables();
244
528
  });
529
+
530
+ Class rb_cHLDA = define_class_under<tomoto::IHLDAModel, tomoto::ILDAModel>(rb_mTomoto, "HLDA")
531
+ .define_singleton_method(
532
+ "_new",
533
+ *[](size_t tw, size_t levelDepth, float alpha, float eta, float gamma, int seed) {
534
+ if (seed < 0) {
535
+ seed = std::random_device{}();
536
+ }
537
+ return tomoto::IHLDAModel::create((tomoto::TermWeight)tw, levelDepth, alpha, eta, gamma, seed);
538
+ })
539
+ .define_method(
540
+ "_children_topics",
541
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
542
+ return self.getChildTopicId(topic_id);
543
+ })
544
+ .define_method(
545
+ "depth",
546
+ *[](tomoto::IHLDAModel& self) {
547
+ return self.getLevelDepth();
548
+ })
549
+ .define_method(
550
+ "gamma",
551
+ *[](tomoto::IHLDAModel& self) {
552
+ return self.getGamma();
553
+ })
554
+ .define_method(
555
+ "_level",
556
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
557
+ return self.getLevelOfTopic(topic_id);
558
+ })
559
+ .define_method(
560
+ "live_k",
561
+ *[](tomoto::IHLDAModel& self) {
562
+ return self.getLiveK();
563
+ })
564
+ .define_method(
565
+ "_live_topic?",
566
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
567
+ return self.isLiveTopic(topic_id);
568
+ })
569
+ .define_method(
570
+ "_num_docs_of_topic",
571
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
572
+ return self.getNumDocsOfTopic(topic_id);
573
+ })
574
+ .define_method(
575
+ "_parent_topic",
576
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
577
+ return self.getParentTopicId(topic_id);
578
+ });
579
+
580
+ Class rb_cPA = define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(rb_mTomoto, "PA")
581
+ .define_singleton_method(
582
+ "_new",
583
+ *[](size_t tw, size_t k1, size_t k2, float alpha, float eta, int seed) {
584
+ if (seed < 0) {
585
+ seed = std::random_device{}();
586
+ }
587
+ return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
588
+ })
589
+ .define_method(
590
+ "k1",
591
+ *[](tomoto::IPAModel& self) {
592
+ return self.getK();
593
+ })
594
+ .define_method(
595
+ "k2",
596
+ *[](tomoto::IPAModel& self) {
597
+ return self.getK2();
598
+ });
599
+
600
+ Class rb_cHPA = define_class_under<tomoto::IHPAModel, tomoto::IPAModel>(rb_mTomoto, "HPA")
601
+ .define_singleton_method(
602
+ "_new",
603
+ *[](size_t tw, size_t k1, size_t k2, float alpha, float eta, int seed) {
604
+ if (seed < 0) {
605
+ seed = std::random_device{}();
606
+ }
607
+ return tomoto::IHPAModel::create((tomoto::TermWeight)tw, false, k1, k2, alpha, eta, seed);
608
+ });
609
+
610
+ Class rb_cMGLDA = define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(rb_mTomoto, "MGLDA")
611
+ .define_singleton_method(
612
+ "_new",
613
+ *[](size_t tw, size_t k_g, size_t k_l, size_t t, float alpha_g, float alpha_l, float alpha_mg, float alpha_ml, float eta_g) {
614
+ return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
615
+ })
616
+ .define_method(
617
+ "_add_doc",
618
+ *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
619
+ self.addDoc(words, delimiter);
620
+ })
621
+ .define_method(
622
+ "alpha_g",
623
+ *[](tomoto::IMGLDAModel& self) {
624
+ return self.getAlpha();
625
+ })
626
+ .define_method(
627
+ "alpha_l",
628
+ *[](tomoto::IMGLDAModel& self) {
629
+ return self.getAlphaL();
630
+ })
631
+ .define_method(
632
+ "alpha_mg",
633
+ *[](tomoto::IMGLDAModel& self) {
634
+ return self.getAlphaM();
635
+ })
636
+ .define_method(
637
+ "alpha_ml",
638
+ *[](tomoto::IMGLDAModel& self) {
639
+ return self.getAlphaML();
640
+ })
641
+ .define_method(
642
+ "eta_g",
643
+ *[](tomoto::IMGLDAModel& self) {
644
+ return self.getEta();
645
+ })
646
+ .define_method(
647
+ "eta_l",
648
+ *[](tomoto::IMGLDAModel& self) {
649
+ return self.getEtaL();
650
+ })
651
+ .define_method(
652
+ "gamma",
653
+ *[](tomoto::IMGLDAModel& self) {
654
+ return self.getGamma();
655
+ })
656
+ .define_method(
657
+ "k_g",
658
+ *[](tomoto::IMGLDAModel& self) {
659
+ return self.getK();
660
+ })
661
+ .define_method(
662
+ "k_l",
663
+ *[](tomoto::IMGLDAModel& self) {
664
+ return self.getKL();
665
+ })
666
+ .define_method(
667
+ "t",
668
+ *[](tomoto::IMGLDAModel& self) {
669
+ return self.getT();
670
+ });
671
+
672
+ Class rb_cLLDA = define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(rb_mTomoto, "LLDA")
673
+ .define_singleton_method(
674
+ "_new",
675
+ *[](size_t tw, size_t k, float alpha, float eta, int seed) {
676
+ if (seed < 0) {
677
+ seed = std::random_device{}();
678
+ }
679
+ return tomoto::ILLDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
680
+ })
681
+ .define_method(
682
+ "_add_doc",
683
+ *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
684
+ self.addDoc(words, labels);
685
+ })
686
+ .define_method(
687
+ "topics_per_label",
688
+ *[](tomoto::ILLDAModel& self) {
689
+ return self.getNumTopicsPerLabel();
690
+ });
691
+
692
+ Class rb_cPLDA = define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(rb_mTomoto, "PLDA")
693
+ .define_singleton_method(
694
+ "_new",
695
+ *[](size_t tw, size_t latent_topics, float alpha, float eta, int seed) {
696
+ if (seed < 0) {
697
+ seed = std::random_device{}();
698
+ }
699
+ return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
700
+ })
701
+ .define_method(
702
+ "_add_doc",
703
+ *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
704
+ self.addDoc(words, labels);
705
+ })
706
+ .define_method(
707
+ "latent_topics",
708
+ *[](tomoto::IPLDAModel& self) {
709
+ return self.getNumLatentTopics();
710
+ });
711
+
712
+ Class rb_cSLDA = define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(rb_mTomoto, "SLDA")
713
+ .define_singleton_method(
714
+ "_new",
715
+ *[](size_t tw, size_t k, Array rb_vars, float alpha, float eta, std::vector<float> mu, std::vector<float> nu_sq, std::vector<float> glm_param, int seed) {
716
+ if (seed < 0) {
717
+ seed = std::random_device{}();
718
+ }
719
+ std::vector<tomoto::ISLDAModel::GLM> vars;
720
+ vars.reserve(rb_vars.size());
721
+ for (auto const& v : rb_vars) {
722
+ vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
723
+ }
724
+ return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
725
+ })
726
+ .define_method(
727
+ "_add_doc",
728
+ *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<float> y) {
729
+ self.addDoc(words, y);
730
+ })
731
+ .define_method(
732
+ "f",
733
+ *[](tomoto::ISLDAModel& self) {
734
+ return self.getF();
735
+ })
736
+ .define_method(
737
+ "_var_type",
738
+ *[](tomoto::ISLDAModel& self, size_t var_id) {
739
+ if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
740
+ return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
741
+ });
245
742
  }
@@ -11,6 +11,9 @@ apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
11
11
  if apple_clang
12
12
  # silence rice warnings
13
13
  $CXXFLAGS += " -Wno-deprecated-declarations"
14
+ else
15
+ # silence eigen warnings
16
+ $CXXFLAGS += " -Wno-ignored-attributes -Wno-deprecated-copy"
14
17
  end
15
18
 
16
19
  # silence tomoto warnings
@@ -3,8 +3,18 @@ require "tomoto/ext"
3
3
 
4
4
  # modules
5
5
  require "tomoto/ct"
6
+ require "tomoto/dmr"
7
+ require "tomoto/dt"
8
+ require "tomoto/gdmr"
6
9
  require "tomoto/hdp"
10
+ require "tomoto/hlda"
11
+ require "tomoto/hpa"
7
12
  require "tomoto/lda"
13
+ require "tomoto/llda"
14
+ require "tomoto/mglda"
15
+ require "tomoto/pa"
16
+ require "tomoto/plda"
17
+ require "tomoto/slda"
8
18
  require "tomoto/version"
9
19
 
10
20
  module Tomoto
@@ -7,5 +7,18 @@ module Tomoto
7
7
  model.instance_variable_set(:@rm_top, rm_top)
8
8
  model
9
9
  end
10
+
11
+ def correlations(topic_id = nil)
12
+ prepare
13
+ if topic_id
14
+ _correlations(topic_id)
15
+ else
16
+ k.times.map { |i| _correlations(i) }
17
+ end
18
+ end
19
+
20
+ def prior_cov # regroups the flat native `_prior_cov` result into consecutive rows of `k` elements
21
+ _prior_cov.each_slice(k).to_a
22
+ end
10
23
  end
11
24
  end
@@ -0,0 +1,23 @@
1
+ module Tomoto
2
+ class DMR
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, sigma: 1.0, alpha_epsilon: 1e-10, seed: nil)
4
+ model = _new(to_tw(tw), k, alpha, sigma, eta, alpha_epsilon, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, metadata: "")
12
+ _add_doc(prepare_doc(doc), [metadata])
13
+ end
14
+
15
+ def lambdas
16
+ if f == 0
17
+ []
18
+ else
19
+ k.times.map { |i| _lambdas(i) }
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class DT
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, t: 1, alpha_var: 0.1, eta_var: 0.1, phi_var: 0.1, lr_a: 0.01, lr_b: 0.1, lr_c: 0.55) #, seed: nil)
4
+ model = _new(to_tw(tw), k, t, alpha_var, eta_var, phi_var, lr_a, lr_b, lr_c)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, timepoint: 0)
12
+ _add_doc(prepare_doc(doc), timepoint)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class GDMR
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, degrees: [], alpha: 0.1, eta: 0.01, sigma: 1.0, sigma0: 3.0, alpha_epsilon: 1e-10, seed: nil)
4
+ model = _new(to_tw(tw), k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, metadata: [])
12
+ _add_doc(prepare_doc(doc), metadata.map(&:to_s))
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,43 @@
1
+ module Tomoto
2
+ class HLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, depth: 2, alpha: 0.1, eta: 0.01, gamma: 0.1, seed: nil)
4
+ model = _new(to_tw(tw), depth, alpha, eta, gamma, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def children_topics(topic_id)
12
+ check_topic(topic_id)
13
+ _children_topics(topic_id)
14
+ end
15
+
16
+ def level(topic_id)
17
+ check_topic(topic_id)
18
+ _live_topic?(topic_id) ? _level(topic_id) : -1
19
+ end
20
+
21
+ def live_topic?(topic_id)
22
+ check_topic(topic_id)
23
+ _live_topic?(topic_id)
24
+ end
25
+
26
+ def num_docs_of_topic(topic_id)
27
+ check_topic(topic_id)
28
+ _num_docs_of_topic(topic_id)
29
+ end
30
+
31
+ def parent_topic(topic_id)
32
+ check_topic(topic_id)
33
+ _live_topic?(topic_id) ? _parent_topic(topic_id) : -1
34
+ end
35
+
36
+ private
37
+
38
+ def check_topic(topic_id)
39
+ raise "topic_id must be < K" if topic_id >= k
40
+ raise "train() should be called first" unless @prepared
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,11 @@
1
+ module Tomoto
2
+ class HPA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k1: 1, k2: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), k1, k2, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+ end
11
+ end
@@ -15,9 +15,7 @@ module Tomoto
15
15
  end
16
16
 
17
17
  def add_doc(doc)
18
- raise "cannot add_doc() after train()" if defined?(@prepared)
19
- doc = doc.split(/[[:space:]]+/) unless doc.is_a?(Array)
20
- _add_doc(doc)
18
+ _add_doc(prepare_doc(doc))
21
19
  end
22
20
 
23
21
  def count_by_topics
@@ -47,6 +45,10 @@ module Tomoto
47
45
  _train(iterations, workers)
48
46
  end
49
47
 
48
+ def tw # translates the native term-weight code from `_tw` via the TERM_WEIGHT lookup
49
+ TERM_WEIGHT[_tw]
50
+ end
51
+
50
52
  private
51
53
 
52
54
  def prepare
@@ -56,6 +58,12 @@ module Tomoto
56
58
  end
57
59
  end
58
60
 
61
+ def prepare_doc(doc)
62
+ raise "cannot add_doc() after train()" if defined?(@prepared)
63
+ doc = doc.split(/[[:space:]]+/) unless doc.is_a?(Array)
64
+ doc
65
+ end
66
+
59
67
  class << self
60
68
  private
61
69
 
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class LLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), k, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, labels: [])
12
+ _add_doc(prepare_doc(doc), labels)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class MGLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k_g: 1, k_l: 1, t: 3, alpha_g: 0.1, alpha_l: 0.1, alpha_mg: 0.1, alpha_ml: 0.1, eta_g: 0.01) #, eta_l: 0.01, gamma: 0.1, seed: nil)
4
+ model = _new(to_tw(tw), k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, delimiter: ".")
12
+ _add_doc(prepare_doc(doc), delimiter)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,11 @@
1
+ module Tomoto
2
+ class PA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k1: 1, k2: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), k1, k2, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class PLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, latent_topics: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), latent_topics, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, labels: [])
12
+ _add_doc(prepare_doc(doc), labels)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,37 @@
1
+ module Tomoto
2
+ class SLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, vars: "", alpha: 0.1, eta: 0.01, mu: [], nu_sq: [], glm_param: [], seed: nil)
4
+ model = _new(to_tw(tw), k, vars.split("").map { |v| to_glm(v) }, alpha, eta, mu, nu_sq, glm_param, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, y: [])
12
+ _add_doc(prepare_doc(doc), y)
13
+ end
14
+
15
+ def var_type(var_id)
16
+ raise "train() should be called first" unless @prepared
17
+ _var_type(var_id)
18
+ end
19
+
20
+ private
21
+
22
+ class << self
23
+ private
24
+
25
+ def to_glm(v)
26
+ case v
27
+ when "l"
28
+ 0
29
+ when "b"
30
+ 1
31
+ else
32
+ raise "Invalid var: #{v}"
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomoto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-09 00:00:00.000000000 Z
11
+ date: 2020-10-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -94,8 +94,18 @@ files:
94
94
  - ext/tomoto/extconf.rb
95
95
  - lib/tomoto.rb
96
96
  - lib/tomoto/ct.rb
97
+ - lib/tomoto/dmr.rb
98
+ - lib/tomoto/dt.rb
99
+ - lib/tomoto/gdmr.rb
97
100
  - lib/tomoto/hdp.rb
101
+ - lib/tomoto/hlda.rb
102
+ - lib/tomoto/hpa.rb
98
103
  - lib/tomoto/lda.rb
104
+ - lib/tomoto/llda.rb
105
+ - lib/tomoto/mglda.rb
106
+ - lib/tomoto/pa.rb
107
+ - lib/tomoto/plda.rb
108
+ - lib/tomoto/slda.rb
99
109
  - lib/tomoto/version.rb
100
110
  - vendor/EigenRand/EigenRand/Core.h
101
111
  - vendor/EigenRand/EigenRand/Dists/Basic.h