tomoto 0.3.2-x86_64-darwin → 0.3.3-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +1 -1
- data/README.md +4 -14
- data/ext/tomoto/llda.cpp +13 -0
- data/lib/tomoto/2.7/tomoto.bundle +0 -0
- data/lib/tomoto/3.0/tomoto.bundle +0 -0
- data/lib/tomoto/3.1/tomoto.bundle +0 -0
- data/lib/tomoto/3.2/tomoto.bundle +0 -0
- data/lib/tomoto/lda.rb +6 -1
- data/lib/tomoto/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b79fb57f7e14e6b483109a2ee2b9905b3ad30c5dc026477494d42238f6c3719d
|
4
|
+
data.tar.gz: 22f74746b73ad822f1fdd1cf8cabdcc28b995d1d3f18097c90ca2894dadb38f2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: acbd74efa07f328b5326944bd836bbb55c310a9d4877f645785072bb08aaabf52453b18c2371f70573acb5806f491659460fdb7881d5733bb576b2694727275f
|
7
|
+
data.tar.gz: cd255e7ce35ef651c1cada54406631b9ddf71d7ee29af66a223bb12053b237e1619e294f2c6ff8a3481eac205a37e1d1e60c0b9d30cd2280c39b6f2c6c32f2ee
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
MIT License
|
2
2
|
|
3
3
|
Copyright (c) 2019, bab2min
|
4
|
-
Copyright (c) 2020-
|
4
|
+
Copyright (c) 2020-2023 Andrew Kane
|
5
5
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
@@ -12,17 +12,15 @@ Add this line to your application’s Gemfile:
|
|
12
12
|
gem "tomoto"
|
13
13
|
```
|
14
14
|
|
15
|
-
ARM is not currently supported
|
16
|
-
|
17
15
|
## Getting Started
|
18
16
|
|
19
17
|
Train a model
|
20
18
|
|
21
19
|
```ruby
|
22
20
|
model = Tomoto::LDA.new(k: 2)
|
23
|
-
model.add_doc("
|
24
|
-
model.add_doc("
|
25
|
-
model.add_doc("
|
21
|
+
model.add_doc(["tokens", "from", "document", "one"])
|
22
|
+
model.add_doc(["tokens", "from", "document", "two"])
|
23
|
+
model.add_doc(["tokens", "from", "document", "three"])
|
26
24
|
model.train(100) # iterations
|
27
25
|
```
|
28
26
|
|
@@ -78,7 +76,7 @@ model.ll_per_word
|
|
78
76
|
Perform inference for unseen documents
|
79
77
|
|
80
78
|
```ruby
|
81
|
-
doc = model.make_doc("unseen doc")
|
79
|
+
doc = model.make_doc(["unseen", "doc"])
|
82
80
|
topic_dist, ll = model.infer(doc)
|
83
81
|
```
|
84
82
|
|
@@ -114,14 +112,6 @@ If a method or option you need isn’t supported, feel free to open an issue.
|
|
114
112
|
- [LDA](examples/lda_basic.rb)
|
115
113
|
- [HDP](examples/hdp_basic.rb)
|
116
114
|
|
117
|
-
## Tokenization
|
118
|
-
|
119
|
-
Documents are tokenized by whitespace by default, or you can perform your own tokenization.
|
120
|
-
|
121
|
-
```ruby
|
122
|
-
model.add_doc(["tokens", "from", "document", "one"])
|
123
|
-
```
|
124
|
-
|
125
115
|
## Performance
|
126
116
|
|
127
117
|
tomoto uses AVX2, AVX, or SSE2 instructions to increase performance on machines that support it. Check which instruction set architecture it’s using with:
|
data/ext/tomoto/llda.cpp
CHANGED
@@ -29,5 +29,18 @@ void init_llda(Rice::Module& m) {
|
|
29
29
|
"topics_per_label",
|
30
30
|
[](tomoto::ILLDAModel& self) {
|
31
31
|
return self.getNumTopicsPerLabel();
|
32
|
+
})
|
33
|
+
.define_method(
|
34
|
+
"topic_label_dict",
|
35
|
+
[](tomoto::ILLDAModel& self) {
|
36
|
+
auto dict = self.getTopicLabelDict();
|
37
|
+
Array res;
|
38
|
+
auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
|
39
|
+
for (size_t i = 0; i < dict.size(); i++) {
|
40
|
+
VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
|
41
|
+
Object obj(value);
|
42
|
+
res.push(obj.call("force_encoding", utf8));
|
43
|
+
}
|
44
|
+
return res;
|
32
45
|
});
|
33
46
|
}
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tomoto/lda.rb
CHANGED
@@ -24,7 +24,7 @@ module Tomoto
|
|
24
24
|
|
25
25
|
# TODO support multiple docs
|
26
26
|
def infer(doc, iter: 100, tolerance: -1, workers: 0, parallel: :default, together: 0)
|
27
|
-
raise "cannot infer with untrained model" unless
|
27
|
+
raise "cannot infer with untrained model" unless trained?
|
28
28
|
_infer(doc, iter, tolerance, workers, to_ps(parallel), together)
|
29
29
|
end
|
30
30
|
|
@@ -86,6 +86,7 @@ module Tomoto
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
# TODO raise error if iterations < 1
|
89
90
|
def train(iterations = 10, workers: 0, parallel: :default)
|
90
91
|
prepare
|
91
92
|
_train(iterations, workers, to_ps(parallel))
|
@@ -97,6 +98,10 @@ module Tomoto
|
|
97
98
|
|
98
99
|
private
|
99
100
|
|
101
|
+
def trained?
|
102
|
+
global_step.positive?
|
103
|
+
end
|
104
|
+
|
100
105
|
def prepare
|
101
106
|
unless defined?(@prepared)
|
102
107
|
_prepare(@min_cf, @min_df, @rm_top)
|
data/lib/tomoto/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomoto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|