tomoto 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a27c5c0ff4a71b0e0b084773adf7a2a0ede39152b210973787c12e98637cb7d3
4
- data.tar.gz: c70cabfa9e8685e86edae56c4b50c52a8bb6baf4d3f7684c3e28d7345e460551
3
+ metadata.gz: 3b40c9adf2f0162eb6174b17395ea37b9294e14b22609e9f51951e9904125ff9
4
+ data.tar.gz: be3f68438f60a7e4fc11033921636f8d03bf411bd3d3eb6aa3b4fb448faac41a
5
5
  SHA512:
6
- metadata.gz: d23a02abb149799facf1b557004ebc2d749131d37eb33e8f70e8aa109f117cc79a3db847326206fb08d5114b983959fa782082c9fe239e3573afe363d81f5066
7
- data.tar.gz: 6ca2548b92c30adea217437dfad8a5e0ef802c2789a21cd5d40d88514607889b854afa6a2b7e1ef06f7ecabaa1226655c0dfc29aaa5439993861c2895542ae98
6
+ metadata.gz: a74747ae372d030c42562d4e2b99ab167ccc28533468ed08819f4bd34d42b340349870712c12e565388eb7833f993349432e77baee8618c34c265676ca181072
7
+ data.tar.gz: da9e833bb98726278108a68a7dd6bed0e54b3979c25d49d8db01aa613e6205a6a3512881688209baf2589c4dc5514ed2663fb251a5e273c42b678bb1daa06d74
@@ -1,3 +1,7 @@
1
+ ## 0.1.1 (2020-10-10)
2
+
3
+ - Added many more models (LLDA, PLDA, SLDA, DMR, GDMR, HLDA, MGLDA, PA, HPA, and DT)
4
+
1
5
  ## 0.1.0 (2020-10-09)
2
6
 
3
7
  - First release
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
- # Tomoto
1
+ # tomoto
2
2
 
3
- [Tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
3
+ :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
+
5
+ [![Build Status](https://travis-ci.org/ankane/tomoto.svg?branch=master)](https://travis-ci.org/ankane/tomoto)
4
6
 
5
7
  ## Installation
6
8
 
@@ -10,7 +12,7 @@ Add this line to your application’s Gemfile:
10
12
  gem 'tomoto'
11
13
  ```
12
14
 
13
- It can take around 10 minutes to compile the extension.
15
+ It can take 10-20 minutes to compile the extension.
14
16
 
15
17
  ## Getting Started
16
18
 
@@ -65,23 +67,27 @@ model.ll_per_word
65
67
  Supports:
66
68
 
67
69
  - Latent Dirichlet Allocation (`LDA`)
70
+ - Labeled LDA (`LLDA`)
71
+ - Partially Labeled LDA (`PLDA`)
72
+ - Supervised LDA (`SLDA`)
73
+ - Dirichlet Multinomial Regression (`DMR`)
74
+ - Generalized Dirichlet Multinomial Regression (`GDMR`)
68
75
  - Hierarchical Dirichlet Process (`HDP`)
76
+ - Hierarchical LDA (`HLDA`)
77
+ - Multi Grain LDA (`MGLDA`)
78
+ - Pachinko Allocation (`PA`)
79
+ - Hierarchical PA (`HPA`)
69
80
  - Correlated Topic Model (`CT`)
81
+ - Dynamic Topic Model (`DT`)
70
82
 
71
- ## Parameters
83
+ ## API
72
84
 
73
- ```ruby
74
- Tomoto::LDA.new(
75
- tw: :one, # or :idf, :pmi
76
- min_cf: 0,
77
- min_df: 0,
78
- rm_top: 0,
79
- k: 1,
80
- alpha: 0.1,
81
- eta: 0.01,
82
- seed: nil
83
- )
84
- ```
85
+ This library follows the [tomotopy API](https://bab2min.github.io/tomotopy/v0.9.0/en/). There are a few changes to make it more Ruby-like:
86
+
87
+ - The `get_` prefix has been removed from methods (`topic_words` instead of `get_topic_words`)
88
+ - Methods that return booleans use `?` instead of `is_` (`live_topic?` instead of `is_live_topic`)
89
+
90
+ If a method or option you need isn’t supported, feel free to open an issue.
85
91
 
86
92
  ## Tokenization
87
93
 
@@ -93,7 +99,7 @@ model.add_doc(["tokens", "from", "document", "one"])
93
99
 
94
100
  ## Performance
95
101
 
96
- Tomoto uses AVX2, AVX, or SSE2 instructions to increase performance on machines that support it. Check what it’s using with:
102
+ tomoto uses AVX2, AVX, or SSE2 instructions to increase performance on machines that support them. Check what it’s using with:
97
103
 
98
104
  ```ruby
99
105
  Tomoto.isa
@@ -1,7 +1,21 @@
1
+ // stdlib
2
+ #include <fstream>
3
+ #include <iostream>
4
+
1
5
  // tomoto
2
6
  #include <CT.h>
7
+ #include <DMR.h>
8
+ #include <DT.h>
9
+ #include <GDMR.h>
3
10
  #include <HDP.h>
11
+ #include <HLDA.h>
12
+ #include <HPA.h>
4
13
  #include <LDA.h>
14
+ #include <LLDA.h>
15
+ #include <MGLDA.h>
16
+ #include <PA.h>
17
+ #include <PLDA.h>
18
+ #include <SLDA.h>
5
19
 
6
20
  // rice
7
21
  #include <rice/Array.hpp>
@@ -26,6 +40,62 @@ Object to_ruby<std::vector<float>>(std::vector<float> const & x)
26
40
  return res;
27
41
  }
28
42
 
43
+ template<>
44
+ Object to_ruby<std::vector<uint32_t>>(std::vector<uint32_t> const & x)
45
+ {
46
+ Array res;
47
+ for (auto const& v : x) {
48
+ res.push(v);
49
+ }
50
+ return res;
51
+ }
52
+
53
+ template<>
54
+ Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
55
+ {
56
+ Array res;
57
+ for (auto const& v : x) {
58
+ res.push(v);
59
+ }
60
+ return res;
61
+ }
62
+
63
+ template<>
64
+ std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
65
+ {
66
+ Array a = Array(x);
67
+ std::vector<std::string> res;
68
+ res.reserve(a.size());
69
+ for (auto const& v : a) {
70
+ res.push_back(from_ruby<std::string>(v));
71
+ }
72
+ return res;
73
+ }
74
+
75
+ template<>
76
+ std::vector<float> from_ruby<std::vector<float>>(Object x)
77
+ {
78
+ Array a = Array(x);
79
+ std::vector<float> res;
80
+ res.reserve(a.size());
81
+ for (auto const& v : a) {
82
+ res.push_back(from_ruby<float>(v));
83
+ }
84
+ return res;
85
+ }
86
+
87
+ template<>
88
+ std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
89
+ {
90
+ Array a = Array(x);
91
+ std::vector<uint64_t> res;
92
+ res.reserve(a.size());
93
+ for (auto const& v : a) {
94
+ res.push_back(from_ruby<uint64_t>(v));
95
+ }
96
+ return res;
97
+ }
98
+
29
99
  extern "C"
30
100
  void Init_ext()
31
101
  {
@@ -55,12 +125,7 @@ void Init_ext()
55
125
  })
56
126
  .define_method(
57
127
  "_add_doc",
58
- *[](tomoto::ILDAModel& self, Array rb_words) {
59
- std::vector<std::string> words;
60
- words.reserve(rb_words.size());
61
- for (auto const& v : rb_words) {
62
- words.push_back(from_ruby<std::string>(v));
63
- }
128
+ *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
64
129
  self.addDoc(words);
65
130
  })
66
131
  .define_method(
@@ -93,6 +158,11 @@ void Init_ext()
93
158
  *[](tomoto::ILDAModel& self) {
94
159
  return self.getEta();
95
160
  })
161
+ .define_method(
162
+ "global_step",
163
+ *[](tomoto::ILDAModel& self) {
164
+ return self.getGlobalStep();
165
+ })
96
166
  .define_method(
97
167
  "k",
98
168
  *[](tomoto::ILDAModel& self) {
@@ -112,15 +182,36 @@ void Init_ext()
112
182
  return self.getLLPerWord();
113
183
  })
114
184
  .define_method(
115
- "num_words",
185
+ "num_docs",
116
186
  *[](tomoto::ILDAModel& self) {
117
- return self.getN();
187
+ return self.getNumDocs();
118
188
  })
119
189
  .define_method(
120
190
  "num_vocabs",
121
191
  *[](tomoto::ILDAModel& self) {
122
192
  return self.getV();
123
193
  })
194
+ .define_method(
195
+ "num_words",
196
+ *[](tomoto::ILDAModel& self) {
197
+ return self.getN();
198
+ })
199
+ .define_method(
200
+ "optim_interval",
201
+ *[](tomoto::ILDAModel& self) {
202
+ return self.getOptimInterval();
203
+ })
204
+ .define_method(
205
+ "optim_interval=",
206
+ *[](tomoto::ILDAModel& self, size_t value) {
207
+ self.setOptimInterval(value);
208
+ return value;
209
+ })
210
+ .define_method(
211
+ "perplexity",
212
+ *[](tomoto::ILDAModel& self) {
213
+ return self.getPerplexity();
214
+ })
124
215
  .define_method(
125
216
  "_prepare",
126
217
  *[](tomoto::ILDAModel& self, size_t minCnt, size_t minDf, size_t rmTop) {
@@ -159,6 +250,62 @@ void Init_ext()
159
250
  size_t ps = 0;
160
251
  self.train(iteration, workers, (tomoto::ParallelScheme)ps);
161
252
  })
253
+ .define_method(
254
+ "_tw",
255
+ *[](tomoto::ILDAModel& self) {
256
+ return (int)self.getTermWeight();
257
+ })
258
+ .define_method(
259
+ "used_vocab_df",
260
+ *[](tomoto::ILDAModel& self) {
261
+ auto vocab = self.getVocabDf();
262
+ Array res;
263
+ for (size_t i = 0; i < self.getV(); i++) {
264
+ res.push(vocab[i]);
265
+ }
266
+ return res;
267
+ })
268
+ .define_method(
269
+ "used_vocab_freq",
270
+ *[](tomoto::ILDAModel& self) {
271
+ auto vocab = self.getVocabCf();
272
+ Array res;
273
+ for (size_t i = 0; i < self.getV(); i++) {
274
+ res.push(vocab[i]);
275
+ }
276
+ return res;
277
+ })
278
+ .define_method(
279
+ "used_vocabs",
280
+ *[](tomoto::ILDAModel& self) {
281
+ auto dict = self.getVocabDict();
282
+ Array res;
283
+ auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
284
+ for (size_t i = 0; i < self.getV(); i++) {
285
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
286
+ }
287
+ return res;
288
+ })
289
+ .define_method(
290
+ "vocab_df",
291
+ *[](tomoto::ILDAModel& self) {
292
+ auto vocab = self.getVocabDf();
293
+ Array res;
294
+ for (size_t i = 0; i < vocab.size(); i++) {
295
+ res.push(vocab[i]);
296
+ }
297
+ return res;
298
+ })
299
+ .define_method(
300
+ "vocab_freq",
301
+ *[](tomoto::ILDAModel& self) {
302
+ auto vocab = self.getVocabCf();
303
+ Array res;
304
+ for (size_t i = 0; i < vocab.size(); i++) {
305
+ res.push(vocab[i]);
306
+ }
307
+ return res;
308
+ })
162
309
  .define_method(
163
310
  "vocabs",
164
311
  *[](tomoto::ILDAModel& self) {
@@ -180,6 +327,11 @@ void Init_ext()
180
327
  }
181
328
  return tomoto::ICTModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
182
329
  })
330
+ .define_method(
331
+ "_correlations",
332
+ *[](tomoto::ICTModel& self, tomoto::Tid topic_id) {
333
+ return self.getCorrelationTopic(topic_id);
334
+ })
183
335
  .define_method(
184
336
  "num_beta_sample",
185
337
  *[](tomoto::ICTModel& self) {
@@ -187,9 +339,9 @@ void Init_ext()
187
339
  })
188
340
  .define_method(
189
341
  "num_beta_sample=",
190
- *[](tomoto::ICTModel& self, size_t numSample) {
191
- self.setNumBetaSample(numSample);
192
- return numSample;
342
+ *[](tomoto::ICTModel& self, size_t value) {
343
+ self.setNumBetaSample(value);
344
+ return value;
193
345
  })
194
346
  .define_method(
195
347
  "num_tmn_sample",
@@ -198,12 +350,12 @@ void Init_ext()
198
350
  })
199
351
  .define_method(
200
352
  "num_tmn_sample=",
201
- *[](tomoto::ICTModel& self, size_t numSample) {
202
- self.setNumTMNSample(numSample);
203
- return numSample;
353
+ *[](tomoto::ICTModel& self, size_t value) {
354
+ self.setNumTMNSample(value);
355
+ return value;
204
356
  })
205
357
  .define_method(
206
- "prior_cov",
358
+ "_prior_cov",
207
359
  *[](tomoto::ICTModel& self) {
208
360
  return self.getPriorCov();
209
361
  })
@@ -213,6 +365,138 @@ void Init_ext()
213
365
  return self.getPriorMean();
214
366
  });
215
367
 
368
+ Class rb_cDMR = define_class_under<tomoto::IDMRModel, tomoto::ILDAModel>(rb_mTomoto, "DMR")
369
+ .define_singleton_method(
370
+ "_new",
371
+ *[](size_t tw, size_t k, float alpha, float sigma, float eta, float alpha_epsilon, int seed) {
372
+ if (seed < 0) {
373
+ seed = std::random_device{}();
374
+ }
375
+ return tomoto::IDMRModel::create((tomoto::TermWeight)tw, k, alpha, sigma, eta, alpha_epsilon, seed);
376
+ })
377
+ .define_method(
378
+ "_add_doc",
379
+ *[](tomoto::IDMRModel& self, std::vector<std::string> words, std::vector<std::string> metadata) {
380
+ self.addDoc(words, metadata);
381
+ })
382
+ .define_method(
383
+ "alpha_epsilon",
384
+ *[](tomoto::IDMRModel& self) {
385
+ return self.getAlphaEps();
386
+ })
387
+ .define_method(
388
+ "alpha_epsilon=",
389
+ *[](tomoto::IDMRModel& self, float value) {
390
+ self.setAlphaEps(value);
391
+ return value;
392
+ })
393
+ .define_method(
394
+ "f",
395
+ *[](tomoto::IDMRModel& self) {
396
+ return self.getF();
397
+ })
398
+ .define_method(
399
+ "_lambdas",
400
+ *[](tomoto::IDMRModel& self, tomoto::Tid topic_id) {
401
+ return self.getLambdaByTopic(topic_id);
402
+ })
403
+ .define_method(
404
+ "metadata_dict",
405
+ *[](tomoto::IDMRModel& self) {
406
+ auto dict = self.getMetadataDict();
407
+ Array res;
408
+ auto utf8 = Class(rb_cEncoding).call("const_get", "UTF_8");
409
+ for (size_t i = 0; i < dict.size(); i++) {
410
+ res.push(to_ruby<std::string>(dict.toWord(i)).call("force_encoding", utf8));
411
+ }
412
+ return res;
413
+ })
414
+ .define_method(
415
+ "sigma",
416
+ *[](tomoto::IDMRModel& self) {
417
+ return self.getSigma();
418
+ });
419
+
420
+ Class rb_cDT = define_class_under<tomoto::IDTModel, tomoto::ILDAModel>(rb_mTomoto, "DT")
421
+ .define_singleton_method(
422
+ "_new",
423
+ *[](size_t tw, size_t k, size_t t, float alphaVar, float etaVar, float phiVar, float shapeA, float shapeB, float shapeC) {
424
+ // Rice only supports 10 arguments, so seed is not accepted as a parameter here; a random seed is always used
425
+ int seed = -1;
426
+ if (seed < 0) {
427
+ seed = std::random_device{}();
428
+ }
429
+ return tomoto::IDTModel::create((tomoto::TermWeight)tw, k, t, alphaVar, etaVar, phiVar, shapeA, shapeB, shapeC, 0, seed);
430
+ })
431
+ .define_method(
432
+ "_add_doc",
433
+ *[](tomoto::IDTModel& self, std::vector<std::string> words, size_t timepoint) {
434
+ self.addDoc(words, timepoint);
435
+ })
436
+ .define_method(
437
+ "lr_a",
438
+ *[](tomoto::IDTModel& self) {
439
+ return self.getShapeA();
440
+ })
441
+ .define_method(
442
+ "lr_a=",
443
+ *[](tomoto::IDTModel& self, float value) {
444
+ self.setShapeA(value);
445
+ return value;
446
+ })
447
+ .define_method(
448
+ "lr_b",
449
+ *[](tomoto::IDTModel& self) {
450
+ return self.getShapeB();
451
+ })
452
+ .define_method(
453
+ "lr_b=",
454
+ *[](tomoto::IDTModel& self, float value) {
455
+ self.setShapeB(value);
456
+ return value;
457
+ })
458
+ .define_method(
459
+ "lr_c",
460
+ *[](tomoto::IDTModel& self) {
461
+ return self.getShapeC();
462
+ })
463
+ .define_method(
464
+ "lr_c=",
465
+ *[](tomoto::IDTModel& self, float value) {
466
+ self.setShapeC(value);
467
+ return value;
468
+ })
469
+ .define_method(
470
+ "num_docs_by_timepoint",
471
+ *[](tomoto::IDTModel& self) {
472
+ return self.getNumDocsByT();
473
+ })
474
+ .define_method(
475
+ "num_timepoints",
476
+ *[](tomoto::IDTModel& self) {
477
+ return self.getT();
478
+ });
479
+
480
+ Class rb_cGDMR = define_class_under<tomoto::IGDMRModel, tomoto::IDMRModel>(rb_mTomoto, "GDMR")
481
+ .define_singleton_method(
482
+ "_new",
483
+ *[](size_t tw, size_t k, std::vector<uint64_t> degrees, float alpha, float sigma, float sigma0, float eta, float alpha_epsilon, int seed) {
484
+ if (seed < 0) {
485
+ seed = std::random_device{}();
486
+ }
487
+ return tomoto::IGDMRModel::create((tomoto::TermWeight)tw, k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed);
488
+ })
489
+ .define_method(
490
+ "degrees",
491
+ *[](tomoto::IGDMRModel& self) {
492
+ return self.getFs();
493
+ })
494
+ .define_method(
495
+ "sigma0",
496
+ *[](tomoto::IGDMRModel& self) {
497
+ return self.getSigma0();
498
+ });
499
+
216
500
  Class rb_cHDP = define_class_under<tomoto::IHDPModel, tomoto::ILDAModel>(rb_mTomoto, "HDP")
217
501
  .define_singleton_method(
218
502
  "_new",
@@ -242,4 +526,217 @@ void Init_ext()
242
526
  *[](tomoto::IHDPModel& self) {
243
527
  return self.getTotalTables();
244
528
  });
529
+
530
+ Class rb_cHLDA = define_class_under<tomoto::IHLDAModel, tomoto::ILDAModel>(rb_mTomoto, "HLDA")
531
+ .define_singleton_method(
532
+ "_new",
533
+ *[](size_t tw, size_t levelDepth, float alpha, float eta, float gamma, int seed) {
534
+ if (seed < 0) {
535
+ seed = std::random_device{}();
536
+ }
537
+ return tomoto::IHLDAModel::create((tomoto::TermWeight)tw, levelDepth, alpha, eta, gamma, seed);
538
+ })
539
+ .define_method(
540
+ "_children_topics",
541
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
542
+ return self.getChildTopicId(topic_id);
543
+ })
544
+ .define_method(
545
+ "depth",
546
+ *[](tomoto::IHLDAModel& self) {
547
+ return self.getLevelDepth();
548
+ })
549
+ .define_method(
550
+ "gamma",
551
+ *[](tomoto::IHLDAModel& self) {
552
+ return self.getGamma();
553
+ })
554
+ .define_method(
555
+ "_level",
556
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
557
+ return self.getLevelOfTopic(topic_id);
558
+ })
559
+ .define_method(
560
+ "live_k",
561
+ *[](tomoto::IHLDAModel& self) {
562
+ return self.getLiveK();
563
+ })
564
+ .define_method(
565
+ "_live_topic?",
566
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
567
+ return self.isLiveTopic(topic_id);
568
+ })
569
+ .define_method(
570
+ "_num_docs_of_topic",
571
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
572
+ return self.getNumDocsOfTopic(topic_id);
573
+ })
574
+ .define_method(
575
+ "_parent_topic",
576
+ *[](tomoto::IHLDAModel& self, tomoto::Tid topic_id) {
577
+ return self.getParentTopicId(topic_id);
578
+ });
579
+
580
+ Class rb_cPA = define_class_under<tomoto::IPAModel, tomoto::ILDAModel>(rb_mTomoto, "PA")
581
+ .define_singleton_method(
582
+ "_new",
583
+ *[](size_t tw, size_t k1, size_t k2, float alpha, float eta, int seed) {
584
+ if (seed < 0) {
585
+ seed = std::random_device{}();
586
+ }
587
+ return tomoto::IPAModel::create((tomoto::TermWeight)tw, k1, k2, alpha, eta, seed);
588
+ })
589
+ .define_method(
590
+ "k1",
591
+ *[](tomoto::IPAModel& self) {
592
+ return self.getK();
593
+ })
594
+ .define_method(
595
+ "k2",
596
+ *[](tomoto::IPAModel& self) {
597
+ return self.getK2();
598
+ });
599
+
600
+ Class rb_cHPA = define_class_under<tomoto::IHPAModel, tomoto::IPAModel>(rb_mTomoto, "HPA")
601
+ .define_singleton_method(
602
+ "_new",
603
+ *[](size_t tw, size_t k1, size_t k2, float alpha, float eta, int seed) {
604
+ if (seed < 0) {
605
+ seed = std::random_device{}();
606
+ }
607
+ return tomoto::IHPAModel::create((tomoto::TermWeight)tw, false, k1, k2, alpha, eta, seed);
608
+ });
609
+
610
+ Class rb_cMGLDA = define_class_under<tomoto::IMGLDAModel, tomoto::ILDAModel>(rb_mTomoto, "MGLDA")
611
+ .define_singleton_method(
612
+ "_new",
613
+ *[](size_t tw, size_t k_g, size_t k_l, size_t t, float alpha_g, float alpha_l, float alpha_mg, float alpha_ml, float eta_g) {
614
+ return tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g);
615
+ })
616
+ .define_method(
617
+ "_add_doc",
618
+ *[](tomoto::IMGLDAModel& self, std::vector<std::string> words, std::string delimiter) {
619
+ self.addDoc(words, delimiter);
620
+ })
621
+ .define_method(
622
+ "alpha_g",
623
+ *[](tomoto::IMGLDAModel& self) {
624
+ return self.getAlpha();
625
+ })
626
+ .define_method(
627
+ "alpha_l",
628
+ *[](tomoto::IMGLDAModel& self) {
629
+ return self.getAlphaL();
630
+ })
631
+ .define_method(
632
+ "alpha_mg",
633
+ *[](tomoto::IMGLDAModel& self) {
634
+ return self.getAlphaM();
635
+ })
636
+ .define_method(
637
+ "alpha_ml",
638
+ *[](tomoto::IMGLDAModel& self) {
639
+ return self.getAlphaML();
640
+ })
641
+ .define_method(
642
+ "eta_g",
643
+ *[](tomoto::IMGLDAModel& self) {
644
+ return self.getEta();
645
+ })
646
+ .define_method(
647
+ "eta_l",
648
+ *[](tomoto::IMGLDAModel& self) {
649
+ return self.getEtaL();
650
+ })
651
+ .define_method(
652
+ "gamma",
653
+ *[](tomoto::IMGLDAModel& self) {
654
+ return self.getGamma();
655
+ })
656
+ .define_method(
657
+ "k_g",
658
+ *[](tomoto::IMGLDAModel& self) {
659
+ return self.getK();
660
+ })
661
+ .define_method(
662
+ "k_l",
663
+ *[](tomoto::IMGLDAModel& self) {
664
+ return self.getKL();
665
+ })
666
+ .define_method(
667
+ "t",
668
+ *[](tomoto::IMGLDAModel& self) {
669
+ return self.getT();
670
+ });
671
+
672
+ Class rb_cLLDA = define_class_under<tomoto::ILLDAModel, tomoto::ILDAModel>(rb_mTomoto, "LLDA")
673
+ .define_singleton_method(
674
+ "_new",
675
+ *[](size_t tw, size_t k, float alpha, float eta, int seed) {
676
+ if (seed < 0) {
677
+ seed = std::random_device{}();
678
+ }
679
+ return tomoto::ILLDAModel::create((tomoto::TermWeight)tw, k, alpha, eta, seed);
680
+ })
681
+ .define_method(
682
+ "_add_doc",
683
+ *[](tomoto::ILLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
684
+ self.addDoc(words, labels);
685
+ })
686
+ .define_method(
687
+ "topics_per_label",
688
+ *[](tomoto::ILLDAModel& self) {
689
+ return self.getNumTopicsPerLabel();
690
+ });
691
+
692
+ Class rb_cPLDA = define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(rb_mTomoto, "PLDA")
693
+ .define_singleton_method(
694
+ "_new",
695
+ *[](size_t tw, size_t latent_topics, float alpha, float eta, int seed) {
696
+ if (seed < 0) {
697
+ seed = std::random_device{}();
698
+ }
699
+ return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, latent_topics, 1, alpha, eta, seed);
700
+ })
701
+ .define_method(
702
+ "_add_doc",
703
+ *[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
704
+ self.addDoc(words, labels);
705
+ })
706
+ .define_method(
707
+ "latent_topics",
708
+ *[](tomoto::IPLDAModel& self) {
709
+ return self.getNumLatentTopics();
710
+ });
711
+
712
+ Class rb_cSLDA = define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(rb_mTomoto, "SLDA")
713
+ .define_singleton_method(
714
+ "_new",
715
+ *[](size_t tw, size_t k, Array rb_vars, float alpha, float eta, std::vector<float> mu, std::vector<float> nu_sq, std::vector<float> glm_param, int seed) {
716
+ if (seed < 0) {
717
+ seed = std::random_device{}();
718
+ }
719
+ std::vector<tomoto::ISLDAModel::GLM> vars;
720
+ vars.reserve(rb_vars.size());
721
+ for (auto const& v : rb_vars) {
722
+ vars.push_back((tomoto::ISLDAModel::GLM) from_ruby<int>(v));
723
+ }
724
+ return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, k, vars, alpha, eta, mu, nu_sq, glm_param, seed);
725
+ })
726
+ .define_method(
727
+ "_add_doc",
728
+ *[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<float> y) {
729
+ self.addDoc(words, y);
730
+ })
731
+ .define_method(
732
+ "f",
733
+ *[](tomoto::ISLDAModel& self) {
734
+ return self.getF();
735
+ })
736
+ .define_method(
737
+ "_var_type",
738
+ *[](tomoto::ISLDAModel& self, size_t var_id) {
739
+ if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
740
+ return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
741
+ });
245
742
  }
@@ -11,6 +11,9 @@ apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
11
11
  if apple_clang
12
12
  # silence rice warnings
13
13
  $CXXFLAGS += " -Wno-deprecated-declarations"
14
+ else
15
+ # silence eigen warnings
16
+ $CXXFLAGS += " -Wno-ignored-attributes -Wno-deprecated-copy"
14
17
  end
15
18
 
16
19
  # silence tomoto warnings
@@ -3,8 +3,18 @@ require "tomoto/ext"
3
3
 
4
4
  # modules
5
5
  require "tomoto/ct"
6
+ require "tomoto/dmr"
7
+ require "tomoto/dt"
8
+ require "tomoto/gdmr"
6
9
  require "tomoto/hdp"
10
+ require "tomoto/hlda"
11
+ require "tomoto/hpa"
7
12
  require "tomoto/lda"
13
+ require "tomoto/llda"
14
+ require "tomoto/mglda"
15
+ require "tomoto/pa"
16
+ require "tomoto/plda"
17
+ require "tomoto/slda"
8
18
  require "tomoto/version"
9
19
 
10
20
  module Tomoto
@@ -7,5 +7,18 @@ module Tomoto
7
7
  model.instance_variable_set(:@rm_top, rm_top)
8
8
  model
9
9
  end
10
+
11
+ def correlations(topic_id = nil)
12
+ prepare
13
+ if topic_id
14
+ _correlations(topic_id)
15
+ else
16
+ k.times.map { |i| _correlations(i) }
17
+ end
18
+ end
19
+
20
+ def prior_cov
21
+ _prior_cov.each_slice(k).to_a
22
+ end
10
23
  end
11
24
  end
@@ -0,0 +1,23 @@
1
+ module Tomoto
2
+ class DMR
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, sigma: 1.0, alpha_epsilon: 1e-10, seed: nil)
4
+ model = _new(to_tw(tw), k, alpha, sigma, eta, alpha_epsilon, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, metadata: "")
12
+ _add_doc(prepare_doc(doc), [metadata])
13
+ end
14
+
15
+ def lambdas
16
+ if f == 0
17
+ []
18
+ else
19
+ k.times.map { |i| _lambdas(i) }
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class DT
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, t: 1, alpha_var: 0.1, eta_var: 0.1, phi_var: 0.1, lr_a: 0.01, lr_b: 0.1, lr_c: 0.55) #, seed: nil)
4
+ model = _new(to_tw(tw), k, t, alpha_var, eta_var, phi_var, lr_a, lr_b, lr_c)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, timepoint: 0)
12
+ _add_doc(prepare_doc(doc), timepoint)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class GDMR
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, degrees: [], alpha: 0.1, eta: 0.01, sigma: 1.0, sigma0: 3.0, alpha_epsilon: 1e-10, seed: nil)
4
+ model = _new(to_tw(tw), k, degrees, alpha, sigma, sigma0, eta, alpha_epsilon, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, metadata: [])
12
+ _add_doc(prepare_doc(doc), metadata.map(&:to_s))
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,43 @@
1
+ module Tomoto
2
+ class HLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, depth: 2, alpha: 0.1, eta: 0.01, gamma: 0.1, seed: nil)
4
+ model = _new(to_tw(tw), depth, alpha, eta, gamma, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def children_topics(topic_id)
12
+ check_topic(topic_id)
13
+ _children_topics(topic_id)
14
+ end
15
+
16
+ def level(topic_id)
17
+ check_topic(topic_id)
18
+ _live_topic?(topic_id) ? _level(topic_id) : -1
19
+ end
20
+
21
+ def live_topic?(topic_id)
22
+ check_topic(topic_id)
23
+ _live_topic?(topic_id)
24
+ end
25
+
26
+ def num_docs_of_topic(topic_id)
27
+ check_topic(topic_id)
28
+ _num_docs_of_topic(topic_id)
29
+ end
30
+
31
+ def parent_topic(topic_id)
32
+ check_topic(topic_id)
33
+ _live_topic?(topic_id) ? _parent_topic(topic_id) : -1
34
+ end
35
+
36
+ private
37
+
38
+ def check_topic(topic_id)
39
+ raise "topic_id must be < K" if topic_id >= k
40
+ raise "train() should be called first" unless @prepared
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,11 @@
1
+ module Tomoto
2
+ class HPA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k1: 1, k2: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), k1, k2, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+ end
11
+ end
@@ -15,9 +15,7 @@ module Tomoto
15
15
  end
16
16
 
17
17
  def add_doc(doc)
18
- raise "cannot add_doc() after train()" if defined?(@prepared)
19
- doc = doc.split(/[[:space:]]+/) unless doc.is_a?(Array)
20
- _add_doc(doc)
18
+ _add_doc(prepare_doc(doc))
21
19
  end
22
20
 
23
21
  def count_by_topics
@@ -47,6 +45,10 @@ module Tomoto
47
45
  _train(iterations, workers)
48
46
  end
49
47
 
48
+ def tw
49
+ TERM_WEIGHT[_tw]
50
+ end
51
+
50
52
  private
51
53
 
52
54
  def prepare
@@ -56,6 +58,12 @@ module Tomoto
56
58
  end
57
59
  end
58
60
 
61
+ def prepare_doc(doc)
62
+ raise "cannot add_doc() after train()" if defined?(@prepared)
63
+ doc = doc.split(/[[:space:]]+/) unless doc.is_a?(Array)
64
+ doc
65
+ end
66
+
59
67
  class << self
60
68
  private
61
69
 
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class LLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), k, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, labels: [])
12
+ _add_doc(prepare_doc(doc), labels)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class MGLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k_g: 1, k_l: 1, t: 3, alpha_g: 0.1, alpha_l: 0.1, alpha_mg: 0.1, alpha_ml: 0.1, eta_g: 0.01) #, eta_l: 0.01, gamma: 0.1, seed: nil)
4
+ model = _new(to_tw(tw), k_g, k_l, t, alpha_g, alpha_l, alpha_mg, alpha_ml, eta_g)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, delimiter: ".")
12
+ _add_doc(prepare_doc(doc), delimiter)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,11 @@
1
+ module Tomoto
2
+ class PA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k1: 1, k2: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), k1, k2, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,15 @@
1
+ module Tomoto
2
+ class PLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, latent_topics: 1, alpha: 0.1, eta: 0.01, seed: nil)
4
+ model = _new(to_tw(tw), latent_topics, alpha, eta, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, labels: [])
12
+ _add_doc(prepare_doc(doc), labels)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,37 @@
1
+ module Tomoto
2
+ class SLDA
3
+ def self.new(tw: :one, min_cf: 0, min_df: 0, rm_top: 0, k: 1, vars: "", alpha: 0.1, eta: 0.01, mu: [], nu_sq: [], glm_param: [], seed: nil)
4
+ model = _new(to_tw(tw), k, vars.split("").map { |v| to_glm(v) }, alpha, eta, mu, nu_sq, glm_param, seed || -1)
5
+ model.instance_variable_set(:@min_cf, min_cf)
6
+ model.instance_variable_set(:@min_df, min_df)
7
+ model.instance_variable_set(:@rm_top, rm_top)
8
+ model
9
+ end
10
+
11
+ def add_doc(doc, y: [])
12
+ _add_doc(prepare_doc(doc), y)
13
+ end
14
+
15
+ def var_type(var_id)
16
+ raise "train() should be called first" unless @prepared
17
+ _var_type(var_id)
18
+ end
19
+
20
+ private
21
+
22
+ class << self
23
+ private
24
+
25
+ def to_glm(v)
26
+ case v
27
+ when "l"
28
+ 0
29
+ when "b"
30
+ 1
31
+ else
32
+ raise "Invalid var: #{v}"
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomoto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-09 00:00:00.000000000 Z
11
+ date: 2020-10-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -94,8 +94,18 @@ files:
94
94
  - ext/tomoto/extconf.rb
95
95
  - lib/tomoto.rb
96
96
  - lib/tomoto/ct.rb
97
+ - lib/tomoto/dmr.rb
98
+ - lib/tomoto/dt.rb
99
+ - lib/tomoto/gdmr.rb
97
100
  - lib/tomoto/hdp.rb
101
+ - lib/tomoto/hlda.rb
102
+ - lib/tomoto/hpa.rb
98
103
  - lib/tomoto/lda.rb
104
+ - lib/tomoto/llda.rb
105
+ - lib/tomoto/mglda.rb
106
+ - lib/tomoto/pa.rb
107
+ - lib/tomoto/plda.rb
108
+ - lib/tomoto/slda.rb
99
109
  - lib/tomoto/version.rb
100
110
  - vendor/EigenRand/EigenRand/Core.h
101
111
  - vendor/EigenRand/EigenRand/Dists/Basic.h