tomoto 0.3.2-arm64-darwin → 0.3.3-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +1 -1
- data/README.md +4 -14
- data/ext/tomoto/llda.cpp +13 -0
- data/lib/tomoto/2.7/tomoto.bundle +0 -0
- data/lib/tomoto/3.0/tomoto.bundle +0 -0
- data/lib/tomoto/3.1/tomoto.bundle +0 -0
- data/lib/tomoto/3.2/tomoto.bundle +0 -0
- data/lib/tomoto/lda.rb +6 -1
- data/lib/tomoto/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5a103162a422fa8a45dcab76b8ee72ea3e7e755d4f924666601f204d868ea13e
|
4
|
+
data.tar.gz: ac60a94e66a6518bbd36a17e05d3bc79b4bbb0b32b14aa619d1a6cda2cef4b7b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d4891fbccb8fcd5572fdbdea4d9ffe6aa7684d16b1690feadad3a99a12b4c799263ba7ae86150ca0980357572ad65e95fcdcb2ce7a47f52ab78140d2752ef53
|
7
|
+
data.tar.gz: 6d4d3565f2c9db5f4b04f68428006f0ad7de028a1065411f6131e6468a332f53e601a507f92604423aa2eabf0b7a695cfbb5cf62fa622c364e82a13a81ef7465
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
MIT License
|
2
2
|
|
3
3
|
Copyright (c) 2019, bab2min
|
4
|
-
Copyright (c) 2020-
|
4
|
+
Copyright (c) 2020-2023 Andrew Kane
|
5
5
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
@@ -12,17 +12,15 @@ Add this line to your application’s Gemfile:
|
|
12
12
|
gem "tomoto"
|
13
13
|
```
|
14
14
|
|
15
|
-
ARM is not currently supported
|
16
|
-
|
17
15
|
## Getting Started
|
18
16
|
|
19
17
|
Train a model
|
20
18
|
|
21
19
|
```ruby
|
22
20
|
model = Tomoto::LDA.new(k: 2)
|
23
|
-
model.add_doc("
|
24
|
-
model.add_doc("
|
25
|
-
model.add_doc("
|
21
|
+
model.add_doc(["tokens", "from", "document", "one"])
|
22
|
+
model.add_doc(["tokens", "from", "document", "two"])
|
23
|
+
model.add_doc(["tokens", "from", "document", "three"])
|
26
24
|
model.train(100) # iterations
|
27
25
|
```
|
28
26
|
|
@@ -78,7 +76,7 @@ model.ll_per_word
|
|
78
76
|
Perform inference for unseen documents
|
79
77
|
|
80
78
|
```ruby
|
81
|
-
doc = model.make_doc("unseen doc")
|
79
|
+
doc = model.make_doc(["unseen", "doc"])
|
82
80
|
topic_dist, ll = model.infer(doc)
|
83
81
|
```
|
84
82
|
|
@@ -114,14 +112,6 @@ If a method or option you need isn’t supported, feel free to open an issue.
|
|
114
112
|
- [LDA](examples/lda_basic.rb)
|
115
113
|
- [HDP](examples/hdp_basic.rb)
|
116
114
|
|
117
|
-
## Tokenization
|
118
|
-
|
119
|
-
Documents are tokenized by whitespace by default, or you can perform your own tokenization.
|
120
|
-
|
121
|
-
```ruby
|
122
|
-
model.add_doc(["tokens", "from", "document", "one"])
|
123
|
-
```
|
124
|
-
|
125
115
|
## Performance
|
126
116
|
|
127
117
|
tomoto uses AVX2, AVX, or SSE2 instructions to increase performance on machines that support it. Check which instruction set architecture it’s using with:
|
data/ext/tomoto/llda.cpp
CHANGED
@@ -29,5 +29,18 @@ void init_llda(Rice::Module& m) {
|
|
29
29
|
"topics_per_label",
|
30
30
|
[](tomoto::ILLDAModel& self) {
|
31
31
|
return self.getNumTopicsPerLabel();
|
32
|
+
})
|
33
|
+
.define_method(
|
34
|
+
"topic_label_dict",
|
35
|
+
[](tomoto::ILLDAModel& self) {
|
36
|
+
auto dict = self.getTopicLabelDict();
|
37
|
+
Array res;
|
38
|
+
auto utf8 = Rice::Class(rb_cEncoding).call("const_get", "UTF_8");
|
39
|
+
for (size_t i = 0; i < dict.size(); i++) {
|
40
|
+
VALUE value = Rice::detail::To_Ruby<std::string>().convert(dict.toWord(i));
|
41
|
+
Object obj(value);
|
42
|
+
res.push(obj.call("force_encoding", utf8));
|
43
|
+
}
|
44
|
+
return res;
|
32
45
|
});
|
33
46
|
}
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tomoto/lda.rb
CHANGED
@@ -24,7 +24,7 @@ module Tomoto
|
|
24
24
|
|
25
25
|
# TODO support multiple docs
|
26
26
|
def infer(doc, iter: 100, tolerance: -1, workers: 0, parallel: :default, together: 0)
|
27
|
-
raise "cannot infer with untrained model" unless
|
27
|
+
raise "cannot infer with untrained model" unless trained?
|
28
28
|
_infer(doc, iter, tolerance, workers, to_ps(parallel), together)
|
29
29
|
end
|
30
30
|
|
@@ -86,6 +86,7 @@ module Tomoto
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
# TODO raise error if iterations < 1
|
89
90
|
def train(iterations = 10, workers: 0, parallel: :default)
|
90
91
|
prepare
|
91
92
|
_train(iterations, workers, to_ps(parallel))
|
@@ -97,6 +98,10 @@ module Tomoto
|
|
97
98
|
|
98
99
|
private
|
99
100
|
|
101
|
+
def trained?
|
102
|
+
global_step.positive?
|
103
|
+
end
|
104
|
+
|
100
105
|
def prepare
|
101
106
|
unless defined?(@prepared)
|
102
107
|
_prepare(@min_cf, @min_df, @rm_top)
|
data/lib/tomoto/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomoto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
platform: arm64-darwin
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|