fasttext 0.1.2 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/LICENSE.txt +18 -18
- data/README.md +26 -19
- data/ext/fasttext/ext.cpp +131 -134
- data/ext/fasttext/extconf.rb +2 -4
- data/lib/fasttext/classifier.rb +23 -10
- data/lib/fasttext/model.rb +10 -0
- data/lib/fasttext/vectorizer.rb +11 -5
- data/lib/fasttext/version.rb +1 -1
- data/vendor/fastText/README.md +3 -3
- data/vendor/fastText/src/args.cc +179 -6
- data/vendor/fastText/src/args.h +29 -1
- data/vendor/fastText/src/autotune.cc +477 -0
- data/vendor/fastText/src/autotune.h +89 -0
- data/vendor/fastText/src/densematrix.cc +27 -7
- data/vendor/fastText/src/densematrix.h +10 -2
- data/vendor/fastText/src/fasttext.cc +125 -114
- data/vendor/fastText/src/fasttext.h +31 -52
- data/vendor/fastText/src/main.cc +32 -13
- data/vendor/fastText/src/meter.cc +148 -2
- data/vendor/fastText/src/meter.h +24 -2
- data/vendor/fastText/src/model.cc +0 -1
- data/vendor/fastText/src/real.h +0 -1
- data/vendor/fastText/src/utils.cc +25 -0
- data/vendor/fastText/src/utils.h +29 -0
- data/vendor/fastText/src/vector.cc +0 -1
- metadata +14 -69
- data/lib/fasttext/ext.bundle +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9aae7e20933f51ebebd802276d7e006ce792fcfc6dd94ea8ceda887ab0f4eca8
|
4
|
+
data.tar.gz: a90a7bbfffe424829052afc3519bc0473a7fc1b22a2d1ea0c8786d9372a0cdb5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3042a798560e5960d18d8bcefb66833bf621bc1dcf26f77742602d0d4e7b1e4cc33b078f9225c209ac9a0d2f89c9b3c98ad2446582af4a3ac4effc761192c58d
|
7
|
+
data.tar.gz: d70ce1005916a809a78b02f23232b1efd7cc59dc9d3ed2df7929f946db2f326c0df40729b2ea953846e906c2774d6b6e971650f3d585da65ae6562bdcb9a9b5e
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,22 @@
|
|
1
|
+
## 0.2.2 (2021-10-16)
|
2
|
+
|
3
|
+
- Fixed `file cannot be opened` errors
|
4
|
+
|
5
|
+
## 0.2.1 (2021-05-23)
|
6
|
+
|
7
|
+
- Improved performance
|
8
|
+
|
9
|
+
## 0.2.0 (2021-05-17)
|
10
|
+
|
11
|
+
- Updated to Rice 4
|
12
|
+
- Dropped support for Ruby < 2.6
|
13
|
+
|
14
|
+
## 0.1.3 (2020-04-28)
|
15
|
+
|
16
|
+
- Updated fastText to 0.9.2
|
17
|
+
- Added support for autotune
|
18
|
+
- Added `--with-optflags` option
|
19
|
+
|
1
20
|
## 0.1.2 (2020-01-10)
|
2
21
|
|
3
22
|
- Fixed installation error with Ruby 2.7
|
data/LICENSE.txt
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
Copyright (c) 2019-2020 Andrew Kane
|
2
|
-
|
3
1
|
MIT License
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
Copyright (c) 2016-present, Facebook, Inc.
|
4
|
+
Copyright (c) 2019-2021 Andrew Kane
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
12
|
|
13
|
-
The above copyright notice and this permission notice shall be
|
14
|
-
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
15
|
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
OF
|
22
|
-
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[fastText](https://fasttext.cc) - efficient text classification and representation learning - for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://
|
5
|
+
[![Build Status](https://github.com/ankane/fastText/workflows/build/badge.svg?branch=master)](https://github.com/ankane/fastText/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -77,6 +77,12 @@ model.labels
|
|
77
77
|
|
78
78
|
> Use `include_freq: true` to get their frequency
|
79
79
|
|
80
|
+
Search for the best hyperparameters
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
model.fit(x, y, autotune_set: [x_valid, y_valid])
|
84
|
+
```
|
85
|
+
|
80
86
|
Compress the model - significantly reduces size but sacrifices a little performance
|
81
87
|
|
82
88
|
```ruby
|
@@ -121,6 +127,12 @@ Get a word vector
|
|
121
127
|
model.word_vector("carrot")
|
122
128
|
```
|
123
129
|
|
130
|
+
Get a sentence vector
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
model.sentence_vector("sentence text")
|
134
|
+
```
|
135
|
+
|
124
136
|
Get words
|
125
137
|
|
126
138
|
```ruby
|
@@ -166,9 +178,13 @@ FastText::Classifier.new(
|
|
166
178
|
thread: 3, # number of threads
|
167
179
|
lr_update_rate: 100, # change the rate of updates for the learning rate
|
168
180
|
t: 0.0001, # sampling threshold
|
169
|
-
label_prefix: "__label__"
|
181
|
+
label_prefix: "__label__", # label prefix
|
170
182
|
verbose: 2, # verbose
|
171
|
-
pretrained_vectors: nil
|
183
|
+
pretrained_vectors: nil, # pretrained word vectors (.vec file)
|
184
|
+
autotune_metric: "f1", # autotune optimization metric
|
185
|
+
autotune_predictions: 1, # autotune predictions
|
186
|
+
autotune_duration: 300, # autotune search time in seconds
|
187
|
+
autotune_model_size: nil # autotune model size, like 2M
|
172
188
|
)
|
173
189
|
```
|
174
190
|
|
@@ -200,7 +216,7 @@ FastText::Vectorizer.new(
|
|
200
216
|
Input can be read directly from files
|
201
217
|
|
202
218
|
```ruby
|
203
|
-
model.fit("train.txt")
|
219
|
+
model.fit("train.txt", autotune_set: "valid.txt")
|
204
220
|
model.test("test.txt")
|
205
221
|
```
|
206
222
|
|
@@ -222,7 +238,7 @@ __label__spam text from document three
|
|
222
238
|
|
223
239
|
## Pretrained Models
|
224
240
|
|
225
|
-
There are a number of [pretrained models](https://fasttext.cc/docs/en/
|
241
|
+
There are a number of [pretrained models](https://fasttext.cc/docs/en/supervised-models.html) you can download
|
226
242
|
|
227
243
|
### Language Identification
|
228
244
|
|
@@ -238,15 +254,6 @@ Get language predictions
|
|
238
254
|
model.predict("bon appétit")
|
239
255
|
```
|
240
256
|
|
241
|
-
## rbenv
|
242
|
-
|
243
|
-
This library uses [Rice](https://github.com/jasonroelofs/rice) to interface with the fastText C++ library. Rice and earlier versions of rbenv don’t play nicely together. If you encounter an error during installation, upgrade ruby-build and reinstall your Ruby version.
|
244
|
-
|
245
|
-
```sh
|
246
|
-
brew upgrade ruby-build
|
247
|
-
rbenv install [version]
|
248
|
-
```
|
249
|
-
|
250
257
|
## History
|
251
258
|
|
252
259
|
View the [changelog](https://github.com/ankane/fasttext/blob/master/CHANGELOG.md)
|
@@ -260,12 +267,12 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
|
|
260
267
|
- Write, clarify, or fix documentation
|
261
268
|
- Suggest or add new features
|
262
269
|
|
263
|
-
To get started with development
|
270
|
+
To get started with development:
|
264
271
|
|
265
272
|
```sh
|
266
|
-
git clone https://github.com/ankane/
|
267
|
-
cd
|
273
|
+
git clone --recursive https://github.com/ankane/fastText.git
|
274
|
+
cd fastText
|
268
275
|
bundle install
|
269
|
-
rake compile
|
270
|
-
rake test
|
276
|
+
bundle exec rake compile
|
277
|
+
bundle exec rake test
|
271
278
|
```
|
data/ext/fasttext/ext.cpp
CHANGED
@@ -1,116 +1,49 @@
|
|
1
|
-
|
2
|
-
#include <densematrix.h>
|
3
|
-
#include <fasttext.h>
|
4
|
-
#include <rice/Data_Type.hpp>
|
5
|
-
#include <rice/Constructor.hpp>
|
6
|
-
#include <rice/Array.hpp>
|
7
|
-
#include <rice/Hash.hpp>
|
8
|
-
#include <real.h>
|
9
|
-
#include <vector.h>
|
1
|
+
// stdlib
|
10
2
|
#include <cmath>
|
11
3
|
#include <iterator>
|
12
4
|
#include <sstream>
|
13
5
|
#include <stdexcept>
|
14
6
|
|
15
|
-
|
7
|
+
// fasttext
|
8
|
+
#include <args.h>
|
9
|
+
#include <autotune.h>
|
10
|
+
#include <densematrix.h>
|
11
|
+
#include <fasttext.h>
|
12
|
+
#include <real.h>
|
13
|
+
#include <vector.h>
|
16
14
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
{
|
21
|
-
Array ret;
|
22
|
-
for (const auto& v : x) {
|
23
|
-
Array a;
|
24
|
-
a.push(v.first);
|
25
|
-
a.push(v.second);
|
26
|
-
ret.push(a);
|
27
|
-
}
|
28
|
-
return ret;
|
29
|
-
}
|
15
|
+
// rice
|
16
|
+
#include <rice/rice.hpp>
|
17
|
+
#include <rice/stl.hpp>
|
30
18
|
|
31
|
-
fasttext::Args
|
32
|
-
|
19
|
+
using fasttext::Args;
|
20
|
+
using fasttext::FastText;
|
33
21
|
|
34
|
-
|
35
|
-
|
36
|
-
|
22
|
+
using Rice::Array;
|
23
|
+
using Rice::Constructor;
|
24
|
+
using Rice::Module;
|
25
|
+
using Rice::define_class_under;
|
26
|
+
using Rice::define_module;
|
27
|
+
using Rice::define_module_under;
|
37
28
|
|
38
|
-
|
29
|
+
namespace Rice::detail
|
30
|
+
{
|
31
|
+
template<>
|
32
|
+
class To_Ruby<std::vector<std::pair<fasttext::real, std::string>>>
|
39
33
|
{
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
} else if (name == "lr_update_rate") {
|
50
|
-
a.lrUpdateRate = from_ruby<int>(value);
|
51
|
-
} else if (name == "dim") {
|
52
|
-
a.dim = from_ruby<int>(value);
|
53
|
-
} else if (name == "ws") {
|
54
|
-
a.ws = from_ruby<int>(value);
|
55
|
-
} else if (name == "epoch") {
|
56
|
-
a.epoch = from_ruby<int>(value);
|
57
|
-
} else if (name == "min_count") {
|
58
|
-
a.minCount = from_ruby<int>(value);
|
59
|
-
} else if (name == "min_count_label") {
|
60
|
-
a.minCountLabel = from_ruby<int>(value);
|
61
|
-
} else if (name == "neg") {
|
62
|
-
a.neg = from_ruby<int>(value);
|
63
|
-
} else if (name == "word_ngrams") {
|
64
|
-
a.wordNgrams = from_ruby<int>(value);
|
65
|
-
} else if (name == "loss") {
|
66
|
-
std::string str = from_ruby<std::string>(value);
|
67
|
-
if (str == "softmax") {
|
68
|
-
a.loss = fasttext::loss_name::softmax;
|
69
|
-
} else if (str == "ns") {
|
70
|
-
a.loss = fasttext::loss_name::ns;
|
71
|
-
} else if (str == "hs") {
|
72
|
-
a.loss = fasttext::loss_name::hs;
|
73
|
-
} else if (str == "ova") {
|
74
|
-
a.loss = fasttext::loss_name::ova;
|
75
|
-
} else {
|
76
|
-
throw std::invalid_argument("Unknown loss: " + str);
|
77
|
-
}
|
78
|
-
} else if (name == "model") {
|
79
|
-
std::string str = from_ruby<std::string>(value);
|
80
|
-
if (str == "supervised") {
|
81
|
-
a.model = fasttext::model_name::sup;
|
82
|
-
} else if (str == "skipgram") {
|
83
|
-
a.model = fasttext::model_name::sg;
|
84
|
-
} else if (str == "cbow") {
|
85
|
-
a.model = fasttext::model_name::cbow;
|
86
|
-
} else {
|
87
|
-
throw std::invalid_argument("Unknown model: " + str);
|
34
|
+
public:
|
35
|
+
VALUE convert(std::vector<std::pair<fasttext::real, std::string>> const & x)
|
36
|
+
{
|
37
|
+
Array ret;
|
38
|
+
for (const auto& v : x) {
|
39
|
+
Array a;
|
40
|
+
a.push(v.first);
|
41
|
+
a.push(v.second);
|
42
|
+
ret.push(a);
|
88
43
|
}
|
89
|
-
|
90
|
-
a.bucket = from_ruby<int>(value);
|
91
|
-
} else if (name == "minn") {
|
92
|
-
a.minn = from_ruby<int>(value);
|
93
|
-
} else if (name == "maxn") {
|
94
|
-
a.maxn = from_ruby<int>(value);
|
95
|
-
} else if (name == "thread") {
|
96
|
-
a.thread = from_ruby<int>(value);
|
97
|
-
} else if (name == "t") {
|
98
|
-
a.t = from_ruby<double>(value);
|
99
|
-
} else if (name == "label_prefix") {
|
100
|
-
a.label = from_ruby<std::string>(value);
|
101
|
-
} else if (name == "verbose") {
|
102
|
-
a.verbose = from_ruby<int>(value);
|
103
|
-
} else if (name == "pretrained_vectors") {
|
104
|
-
a.pretrainedVectors = from_ruby<std::string>(value);
|
105
|
-
} else if (name == "save_output") {
|
106
|
-
a.saveOutput = from_ruby<bool>(value);
|
107
|
-
// } else if (name == "seed") {
|
108
|
-
// a.seed = from_ruby<int>(value);
|
109
|
-
} else {
|
110
|
-
throw std::invalid_argument("Unknown argument: " + name);
|
44
|
+
return ret;
|
111
45
|
}
|
112
|
-
}
|
113
|
-
return a;
|
46
|
+
};
|
114
47
|
}
|
115
48
|
|
116
49
|
extern "C"
|
@@ -119,11 +52,68 @@ void Init_ext()
|
|
119
52
|
Module rb_mFastText = define_module("FastText");
|
120
53
|
Module rb_mExt = define_module_under(rb_mFastText, "Ext");
|
121
54
|
|
122
|
-
define_class_under<
|
123
|
-
.define_constructor(Constructor<
|
55
|
+
define_class_under<Args>(rb_mExt, "Args")
|
56
|
+
.define_constructor(Constructor<Args>())
|
57
|
+
.define_attr("input", &Args::input)
|
58
|
+
.define_attr("output", &Args::output)
|
59
|
+
.define_attr("lr", &Args::lr)
|
60
|
+
.define_attr("lr_update_rate", &Args::lrUpdateRate)
|
61
|
+
.define_attr("dim", &Args::dim)
|
62
|
+
.define_attr("ws", &Args::ws)
|
63
|
+
.define_attr("epoch", &Args::epoch)
|
64
|
+
.define_attr("min_count", &Args::minCount)
|
65
|
+
.define_attr("min_count_label", &Args::minCountLabel)
|
66
|
+
.define_attr("neg", &Args::neg)
|
67
|
+
.define_attr("word_ngrams", &Args::wordNgrams)
|
68
|
+
.define_method(
|
69
|
+
"loss=",
|
70
|
+
[](Args& a, const std::string& str) {
|
71
|
+
if (str == "softmax") {
|
72
|
+
a.loss = fasttext::loss_name::softmax;
|
73
|
+
} else if (str == "ns") {
|
74
|
+
a.loss = fasttext::loss_name::ns;
|
75
|
+
} else if (str == "hs") {
|
76
|
+
a.loss = fasttext::loss_name::hs;
|
77
|
+
} else if (str == "ova") {
|
78
|
+
a.loss = fasttext::loss_name::ova;
|
79
|
+
} else {
|
80
|
+
throw std::invalid_argument("Unknown loss: " + str);
|
81
|
+
}
|
82
|
+
})
|
83
|
+
.define_method(
|
84
|
+
"model=",
|
85
|
+
[](Args& a, const std::string& str) {
|
86
|
+
if (str == "supervised") {
|
87
|
+
a.model = fasttext::model_name::sup;
|
88
|
+
} else if (str == "skipgram") {
|
89
|
+
a.model = fasttext::model_name::sg;
|
90
|
+
} else if (str == "cbow") {
|
91
|
+
a.model = fasttext::model_name::cbow;
|
92
|
+
} else {
|
93
|
+
throw std::invalid_argument("Unknown model: " + str);
|
94
|
+
}
|
95
|
+
})
|
96
|
+
.define_attr("bucket", &Args::bucket)
|
97
|
+
.define_attr("minn", &Args::minn)
|
98
|
+
.define_attr("maxn", &Args::maxn)
|
99
|
+
.define_attr("thread", &Args::thread)
|
100
|
+
.define_attr("t", &Args::t)
|
101
|
+
.define_attr("label_prefix", &Args::label)
|
102
|
+
.define_attr("verbose", &Args::verbose)
|
103
|
+
.define_attr("pretrained_vectors", &Args::pretrainedVectors)
|
104
|
+
.define_attr("save_output", &Args::saveOutput)
|
105
|
+
.define_attr("seed", &Args::seed)
|
106
|
+
.define_attr("autotune_validation_file", &Args::autotuneValidationFile)
|
107
|
+
.define_attr("autotune_metric", &Args::autotuneMetric)
|
108
|
+
.define_attr("autotune_predictions", &Args::autotunePredictions)
|
109
|
+
.define_attr("autotune_duration", &Args::autotuneDuration)
|
110
|
+
.define_attr("autotune_model_size", &Args::autotuneModelSize);
|
111
|
+
|
112
|
+
define_class_under<FastText>(rb_mExt, "Model")
|
113
|
+
.define_constructor(Constructor<FastText>())
|
124
114
|
.define_method(
|
125
115
|
"words",
|
126
|
-
|
116
|
+
[](FastText& m) {
|
127
117
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
128
118
|
std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::word);
|
129
119
|
|
@@ -141,7 +131,7 @@ void Init_ext()
|
|
141
131
|
})
|
142
132
|
.define_method(
|
143
133
|
"labels",
|
144
|
-
|
134
|
+
[](FastText& m) {
|
145
135
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
146
136
|
std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::label);
|
147
137
|
|
@@ -159,12 +149,12 @@ void Init_ext()
|
|
159
149
|
})
|
160
150
|
.define_method(
|
161
151
|
"test",
|
162
|
-
|
152
|
+
[](FastText& m, const std::string& filename, int32_t k) {
|
163
153
|
std::ifstream ifs(filename);
|
164
154
|
if (!ifs.is_open()) {
|
165
155
|
throw std::invalid_argument("Test file cannot be opened!");
|
166
156
|
}
|
167
|
-
fasttext::Meter meter;
|
157
|
+
fasttext::Meter meter(false);
|
168
158
|
m.test(ifs, k, 0.0, meter);
|
169
159
|
ifs.close();
|
170
160
|
|
@@ -176,17 +166,21 @@ void Init_ext()
|
|
176
166
|
})
|
177
167
|
.define_method(
|
178
168
|
"load_model",
|
179
|
-
|
169
|
+
[](FastText& m, const std::string& s) {
|
170
|
+
m.loadModel(s);
|
171
|
+
})
|
180
172
|
.define_method(
|
181
173
|
"save_model",
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
.define_method("
|
186
|
-
.define_method("
|
174
|
+
[](FastText& m, const std::string& s) {
|
175
|
+
m.saveModel(s);
|
176
|
+
})
|
177
|
+
.define_method("dimension", &FastText::getDimension)
|
178
|
+
.define_method("quantized?", &FastText::isQuant)
|
179
|
+
.define_method("word_id", &FastText::getWordId)
|
180
|
+
.define_method("subword_id", &FastText::getSubwordId)
|
187
181
|
.define_method(
|
188
182
|
"predict",
|
189
|
-
|
183
|
+
[](FastText& m, const std::string& text, int32_t k, float threshold) {
|
190
184
|
std::stringstream ioss(text);
|
191
185
|
std::vector<std::pair<fasttext::real, std::string>> predictions;
|
192
186
|
m.predictLine(ioss, predictions, k, threshold);
|
@@ -194,27 +188,26 @@ void Init_ext()
|
|
194
188
|
})
|
195
189
|
.define_method(
|
196
190
|
"nearest_neighbors",
|
197
|
-
|
191
|
+
[](FastText& m, const std::string& word, int32_t k) {
|
198
192
|
return m.getNN(word, k);
|
199
193
|
})
|
200
|
-
.define_method("analogies", &
|
201
|
-
.define_method("ngram_vectors", &
|
194
|
+
.define_method("analogies", &FastText::getAnalogies)
|
195
|
+
// .define_method("ngram_vectors", &FastText::getNgramVectors)
|
202
196
|
.define_method(
|
203
197
|
"word_vector",
|
204
|
-
|
205
|
-
|
198
|
+
[](FastText& m, const std::string& word) {
|
199
|
+
auto dimension = m.getDimension();
|
206
200
|
fasttext::Vector vec = fasttext::Vector(dimension);
|
207
201
|
m.getWordVector(vec, word);
|
208
|
-
float* data = vec.data();
|
209
202
|
Array ret;
|
210
|
-
for (
|
211
|
-
ret.push(
|
203
|
+
for (size_t i = 0; i < vec.size(); i++) {
|
204
|
+
ret.push(vec[i]);
|
212
205
|
}
|
213
206
|
return ret;
|
214
207
|
})
|
215
208
|
.define_method(
|
216
209
|
"subwords",
|
217
|
-
|
210
|
+
[](FastText& m, const std::string& word) {
|
218
211
|
std::vector<std::string> subwords;
|
219
212
|
std::vector<int32_t> ngrams;
|
220
213
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
@@ -228,36 +221,40 @@ void Init_ext()
|
|
228
221
|
})
|
229
222
|
.define_method(
|
230
223
|
"sentence_vector",
|
231
|
-
|
224
|
+
[](FastText& m, const std::string& text) {
|
232
225
|
std::istringstream in(text);
|
233
|
-
|
226
|
+
auto dimension = m.getDimension();
|
234
227
|
fasttext::Vector vec = fasttext::Vector(dimension);
|
235
228
|
m.getSentenceVector(in, vec);
|
236
|
-
float* data = vec.data();
|
237
229
|
Array ret;
|
238
|
-
for (
|
239
|
-
ret.push(
|
230
|
+
for (size_t i = 0; i < vec.size(); i++) {
|
231
|
+
ret.push(vec[i]);
|
240
232
|
}
|
241
233
|
return ret;
|
242
234
|
})
|
243
235
|
.define_method(
|
244
236
|
"train",
|
245
|
-
|
246
|
-
|
237
|
+
[](FastText& m, Args& a) {
|
238
|
+
if (a.hasAutotune()) {
|
239
|
+
fasttext::Autotune autotune(std::shared_ptr<fasttext::FastText>(&m, [](fasttext::FastText*) {}));
|
240
|
+
autotune.train(a);
|
241
|
+
} else {
|
242
|
+
m.train(a);
|
243
|
+
}
|
247
244
|
})
|
248
245
|
.define_method(
|
249
246
|
"quantize",
|
250
|
-
|
251
|
-
m.quantize(
|
247
|
+
[](FastText& m, Args& a) {
|
248
|
+
m.quantize(a);
|
252
249
|
})
|
253
250
|
.define_method(
|
254
251
|
"supervised?",
|
255
|
-
|
252
|
+
[](FastText& m) {
|
256
253
|
return m.getArgs().model == fasttext::model_name::sup;
|
257
254
|
})
|
258
255
|
.define_method(
|
259
256
|
"label_prefix",
|
260
|
-
|
257
|
+
[](FastText& m) {
|
261
258
|
return m.getArgs().label;
|
262
259
|
});
|
263
260
|
}
|
data/ext/fasttext/extconf.rb
CHANGED
@@ -1,9 +1,7 @@
|
|
1
1
|
require "mkmf-rice"
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
# TODO use -std=c++14 when available
|
6
|
-
$CXXFLAGS << " -pthread -std=c++11 -funroll-loops -O3 -march=native"
|
3
|
+
# -pthread and -O3 set by default
|
4
|
+
$CXXFLAGS << " -std=c++17 $(optflags) -funroll-loops " << with_config("optflags", "-march=native")
|
7
5
|
|
8
6
|
ext = File.expand_path(".", __dir__)
|
9
7
|
fasttext = File.expand_path("../../vendor/fastText/src", __dir__)
|
data/lib/fasttext/classifier.rb
CHANGED
@@ -21,13 +21,25 @@ module FastText
|
|
21
21
|
verbose: 2,
|
22
22
|
pretrained_vectors: "",
|
23
23
|
save_output: false,
|
24
|
-
|
24
|
+
seed: 0,
|
25
|
+
autotune_validation_file: "",
|
26
|
+
autotune_metric: "f1",
|
27
|
+
autotune_predictions: 1,
|
28
|
+
autotune_duration: 60 * 5,
|
29
|
+
autotune_model_size: ""
|
25
30
|
}
|
26
31
|
|
27
|
-
def fit(x, y = nil)
|
28
|
-
input = input_path(x, y)
|
32
|
+
def fit(x, y = nil, autotune_set: nil)
|
33
|
+
input, _ref = input_path(x, y)
|
29
34
|
@m ||= Ext::Model.new
|
30
|
-
|
35
|
+
a = build_args(DEFAULT_OPTIONS)
|
36
|
+
a.input = input
|
37
|
+
a.model = "supervised"
|
38
|
+
if autotune_set
|
39
|
+
x, y = autotune_set
|
40
|
+
a.autotune_validation_file, _autotune_ref = input_path(x, y)
|
41
|
+
end
|
42
|
+
m.train(a)
|
31
43
|
end
|
32
44
|
|
33
45
|
def predict(text, k: 1, threshold: 0.0)
|
@@ -37,16 +49,16 @@ module FastText
|
|
37
49
|
# TODO predict multiple in C++ for performance
|
38
50
|
result =
|
39
51
|
text.map do |t|
|
40
|
-
m.predict(prep_text(t), k, threshold).
|
52
|
+
m.predict(prep_text(t), k, threshold).to_h do |v|
|
41
53
|
[remove_prefix(v[1]), v[0]]
|
42
|
-
end
|
54
|
+
end
|
43
55
|
end
|
44
56
|
|
45
57
|
multiple ? result : result.first
|
46
58
|
end
|
47
59
|
|
48
60
|
def test(x, y = nil, k: 1)
|
49
|
-
input = input_path(x, y)
|
61
|
+
input, _ref = input_path(x, y)
|
50
62
|
res = m.test(input, k)
|
51
63
|
{
|
52
64
|
examples: res[0],
|
@@ -57,7 +69,8 @@ module FastText
|
|
57
69
|
|
58
70
|
# TODO support options
|
59
71
|
def quantize
|
60
|
-
|
72
|
+
a = Ext::Args.new
|
73
|
+
m.quantize(a)
|
61
74
|
end
|
62
75
|
|
63
76
|
def labels(include_freq: false)
|
@@ -75,7 +88,7 @@ module FastText
|
|
75
88
|
def input_path(x, y)
|
76
89
|
if x.is_a?(String)
|
77
90
|
raise ArgumentError, "Cannot pass y with file" if y
|
78
|
-
x
|
91
|
+
[x, nil]
|
79
92
|
else
|
80
93
|
tempfile = Tempfile.new("fasttext")
|
81
94
|
x.zip(y) do |xi, yi|
|
@@ -85,7 +98,7 @@ module FastText
|
|
85
98
|
tempfile.write("\n")
|
86
99
|
end
|
87
100
|
tempfile.close
|
88
|
-
tempfile.path
|
101
|
+
[tempfile.path, tempfile]
|
89
102
|
end
|
90
103
|
end
|
91
104
|
|
data/lib/fasttext/model.rb
CHANGED
@@ -56,5 +56,15 @@ module FastText
|
|
56
56
|
def m
|
57
57
|
@m || (raise Error, "Not fit")
|
58
58
|
end
|
59
|
+
|
60
|
+
def build_args(default_options)
|
61
|
+
a = Ext::Args.new
|
62
|
+
opts = @options.dup
|
63
|
+
default_options.each do |k, v|
|
64
|
+
a.send("#{k}=", opts.delete(k) || v)
|
65
|
+
end
|
66
|
+
raise ArgumentError, "Unknown argument: #{opts.keys.first}" if opts.any?
|
67
|
+
a
|
68
|
+
end
|
59
69
|
end
|
60
70
|
end
|