tomoto 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cbe3d5af04af38307a6bb5cf2db40fbc6df243bfaa34a3e53cd3360a2eed809d
4
- data.tar.gz: 88cab90fd0a3ab1e1554ee9561eda4a23761703171b3262e36f564b6ebec5d8e
3
+ metadata.gz: daa9c454c4cf09e120dbbe17305d225be58ac5937c463886e87ea1b3c3b5d466
4
+ data.tar.gz: f6c0c353a0efcc6026964e9125f1156b50e0d119506ecab2812522f7b716042d
5
5
  SHA512:
6
- metadata.gz: 9dc6d9734f06988259b1c2c45e63549713f4d8071c1b81ffda4a43573b908a0e601b79f5cf23e1ecd94c880dbde16d50a297c6d109a878e2c45c5a3b0edf2158
7
- data.tar.gz: 47a95868b9d750de9aba52ccc11849ee50968c4dc988d670df0d13d62ec14f0dcd49c5709b0b54f8bd66972cd9d2243eedf73632fe61221bfea23b60dfa4e573
6
+ metadata.gz: 874f531a75a62d2291793ded080f380f8103682c2ae2b087dd31a014533443d5f35a7ea4e634aabd246fab1564ece35679c60c0b6ffbb6a627d57048e32bf790
7
+ data.tar.gz: e91bf3c618394f34f208fe4945729db6719a2cab1a8e7192e646b232d7e38274f2087808d145dccddb94b3cc632b9dddce82f942f051a4bf522ec5ec9d2c43b3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.3.3 (2023-02-01)
2
+
3
+ - Added `topic_label_dict` method to `LLDA`
4
+ - Fixed error with `infer` with loaded model
5
+
1
6
  ## 0.3.2 (2023-01-22)
2
7
 
3
8
  - Added precompiled gem for Mac ARM
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  MIT License
2
2
 
3
3
  Copyright (c) 2019, bab2min
4
- Copyright (c) 2020-2021 Andrew Kane
4
+ Copyright (c) 2020-2023 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -12,17 +12,15 @@ Add this line to your application’s Gemfile:
12
12
  gem "tomoto"
13
13
  ```
14
14
 
15
- ARM is not currently supported
16
-
17
15
  ## Getting Started
18
16
 
19
17
  Train a model
20
18
 
21
19
  ```ruby
22
20
  model = Tomoto::LDA.new(k: 2)
23
- model.add_doc("text from document one")
24
- model.add_doc("text from document two")
25
- model.add_doc("text from document three")
21
+ model.add_doc(["tokens", "from", "document", "one"])
22
+ model.add_doc(["tokens", "from", "document", "two"])
23
+ model.add_doc(["tokens", "from", "document", "three"])
26
24
  model.train(100) # iterations
27
25
  ```
28
26
 
@@ -78,7 +76,7 @@ model.ll_per_word
78
76
  Perform inference for unseen documents
79
77
 
80
78
  ```ruby
81
- doc = model.make_doc("unseen doc")
79
+ doc = model.make_doc(["unseen", "doc"])
82
80
  topic_dist, ll = model.infer(doc)
83
81
  ```
84
82
 
@@ -114,14 +112,6 @@ If a method or option you need isn’t supported, feel free to open an issue.
114
112
  - [LDA](examples/lda_basic.rb)
115
113
  - [HDP](examples/hdp_basic.rb)
116
114
 
117
- ## Tokenization
118
-
119
- Documents are tokenized by whitespace by default, or you can perform your own tokenization.
120
-
121
- ```ruby
122
- model.add_doc(["tokens", "from", "document", "one"])
123
- ```
124
-
125
115
  ## Performance
126
116
 
127
117
  tomoto uses AVX2, AVX, or SSE2 instructions to increase performance on machines that support it. Check which instruction set architecture it’s using with:
data/ext/tomoto/llda.cpp CHANGED
@@ -29,5 +29,18 @@ void init_llda(Rice::Module& m) {
29
29
  "topics_per_label",
30
30
  [](tomoto::ILLDAModel& self) {
31
31
  return self.getNumTopicsPerLabel();
32
+ })
33
+ .define_method(
34
+ "topic_label_dict",
35
+ [](tomoto::ILLDAModel& self) {
36
+ auto dict = self.getTopicLabelDict();
37
+ Array res;
38
+ auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
39
+ for (size_t i = 0; i < dict.size(); i++) {
40
+ VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
41
+ Object obj(value);
42
+ res.push(obj.call("force_encoding", utf8));
43
+ }
44
+ return res;
32
45
  });
33
46
  }
data/lib/tomoto/lda.rb CHANGED
@@ -24,7 +24,7 @@ module Tomoto
24
24
 
25
25
  # TODO support multiple docs
26
26
  def infer(doc, iter: 100, tolerance: -1, workers: 0, parallel: :default, together: 0)
27
- raise "cannot infer with untrained model" unless defined?(@prepared)
27
+ raise "cannot infer with untrained model" unless trained?
28
28
  _infer(doc, iter, tolerance, workers, to_ps(parallel), together)
29
29
  end
30
30
 
@@ -86,6 +86,7 @@ module Tomoto
86
86
  end
87
87
  end
88
88
 
89
+ # TODO raise error if iterations < 1
89
90
  def train(iterations = 10, workers: 0, parallel: :default)
90
91
  prepare
91
92
  _train(iterations, workers, to_ps(parallel))
@@ -97,6 +98,10 @@ module Tomoto
97
98
 
98
99
  private
99
100
 
101
+ def trained?
102
+ global_step.positive?
103
+ end
104
+
100
105
  def prepare
101
106
  unless defined?(@prepared)
102
107
  _prepare(@min_cf, @min_df, @rm_top)
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.3.2"
2
+ VERSION = "0.3.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomoto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-23 00:00:00.000000000 Z
11
+ date: 2023-02-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice