tomoto 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1044b496120cf955a03d6dd184056c40572501a68043c3bd3d4cc17334caba3d
4
- data.tar.gz: 7078dd4bdc562cae748ca89aa9b73d38d209d6a40a1e6d2acff0d8fba0c5a18b
3
+ metadata.gz: cd717980f682fd9151cf51a439e3ab54ff59f88575aa8c552b45b769048e9e6b
4
+ data.tar.gz: 80ebf4430f748279d4973ae8fd3949bdaf3043446370a81b7d7c1ed3107358fa
5
5
  SHA512:
6
- metadata.gz: ef9944cc3820397ef18bbca2e42d4a1a0d4b43dd283f3fac8e066478ecc78e74b4a2d32a6f3304203e22fad048ebdacc60637b7f51a42cf6fc73053613a3e77e
7
- data.tar.gz: 8409a754f890f788b6bc33938420311917418cf62f7cfde572db307a85f4473835caa342583ee6e3a6d03f517d89f75714b468351fa50011dbd156a9547c4918
6
+ metadata.gz: 4d3cc8d59f665d7957b1f94d60f688fdbe4d3400d984f716e29a6d32334b46b901f7ef0da6702ce5582cbd90e7c9c42727f04c19820c8e971e6c47d597dcc6f6
7
+ data.tar.gz: d0627badaa6d0ee3b65d30f805e6a5c54d440911e8ec770e69e70c0b1a840d8d0ee6c248fa1479162b2270eeb5038fb2ff5ca6d80a8233a25df83dfb562ea743
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.2.1 (2021-08-23)
2
+
3
+ - Added support for unseen documents
4
+
1
5
  ## 0.2.0 (2021-05-23)
2
6
 
3
7
  - Updated tomoto to 0.12.0
data/README.md CHANGED
@@ -75,6 +75,13 @@ Get the log likelihood per word
75
75
  model.ll_per_word
76
76
  ```
77
77
 
78
+ Perform inference for unseen documents
79
+
80
+ ```ruby
81
+ doc = model.make_doc("unseen doc")
82
+ topic_dist, ll = model.infer(doc)
83
+ ```
84
+
78
85
  ## Models
79
86
 
80
87
  Supports:
data/ext/tomoto/lda.cpp CHANGED
@@ -49,6 +49,30 @@ void init_lda(Rice::Module& m) {
49
49
  *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
50
50
  return self.addDoc(buildDoc(words));
51
51
  })
52
+ .define_method(
53
+ "_make_doc",
54
+ *[](tomoto::ILDAModel& self, std::vector<std::string> words) {
55
+ return DocumentObject(self.makeDoc(buildDoc(words)).release(), &self);
56
+ })
57
+ .define_method(
58
+ "_infer",
59
+ *[](tomoto::ILDAModel& self, DocumentObject& doc_object, size_t iteration, float tolerance, size_t workers, size_t ps, size_t together) {
60
+ std::vector<tomoto::DocumentBase*> docs;
61
+ auto doc = doc_object.doc;
62
+ docs.emplace_back(const_cast<tomoto::DocumentBase*>(doc));
63
+ float ll = self.infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together)[0];
64
+
65
+ auto topic_dist = self.getTopicsByDoc(doc);
66
+ auto topic_res = Array();
67
+ for (size_t i = 0; i < topic_dist.size(); i++) {
68
+ topic_res.push(topic_dist[i]);
69
+ }
70
+
71
+ auto res = Array();
72
+ res.push(topic_res);
73
+ res.push(ll);
74
+ return res;
75
+ })
52
76
  .define_method(
53
77
  "alpha",
54
78
  *[](tomoto::ILDAModel& self) {
Binary file
Binary file
Binary file
data/lib/tomoto/lda.rb CHANGED
@@ -18,6 +18,16 @@ module Tomoto
18
18
  _add_doc(prepare_doc(doc))
19
19
  end
20
20
 
21
+ def make_doc(doc)
22
+ _make_doc(tokenize_doc(doc))
23
+ end
24
+
25
+ # TODO support multiple docs
26
+ def infer(doc, iter: 100, tolerance: -1, workers: 0, parallel: :default, together: 0)
27
+ raise "cannot infer with untrained model" unless defined?(@prepared)
28
+ _infer(doc, iter, tolerance, workers, to_ps(parallel), together)
29
+ end
30
+
21
31
  def count_by_topics
22
32
  prepare
23
33
  _count_by_topics
@@ -96,6 +106,10 @@ module Tomoto
96
106
 
97
107
  def prepare_doc(doc)
98
108
  raise "cannot add_doc() after train()" if defined?(@prepared)
109
+ tokenize_doc(doc)
110
+ end
111
+
112
+ def tokenize_doc(doc)
99
113
  doc = doc.split(/[[:space:]]+/) unless doc.is_a?(Array)
100
114
  doc
101
115
  end
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomoto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-05-23 00:00:00.000000000 Z
11
+ date: 2021-08-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -51,6 +51,9 @@ files:
51
51
  - ext/tomoto/slda.cpp
52
52
  - ext/tomoto/utils.h
53
53
  - lib/tomoto.rb
54
+ - lib/tomoto/2.6/tomoto.so
55
+ - lib/tomoto/2.7/tomoto.so
56
+ - lib/tomoto/3.0/tomoto.so
54
57
  - lib/tomoto/ct.rb
55
58
  - lib/tomoto/dmr.rb
56
59
  - lib/tomoto/dt.rb
@@ -64,8 +67,6 @@ files:
64
67
  - lib/tomoto/pa.rb
65
68
  - lib/tomoto/plda.rb
66
69
  - lib/tomoto/slda.rb
67
- - lib/tomoto/tomoto.bundle
68
- - lib/tomoto/tomoto.so
69
70
  - lib/tomoto/version.rb
70
71
  - vendor/EigenRand/EigenRand/Core.h
71
72
  - vendor/EigenRand/EigenRand/Dists/Basic.h
@@ -508,7 +509,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
508
509
  - !ruby/object:Gem::Version
509
510
  version: '0'
510
511
  requirements: []
511
- rubygems_version: 3.2.3
512
+ rubygems_version: 3.2.22
512
513
  signing_key:
513
514
  specification_version: 4
514
515
  summary: High performance topic modeling for Ruby
Binary file
data/lib/tomoto/tomoto.so DELETED
Binary file