fasttext 0.1.3 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/LICENSE.txt +18 -18
- data/README.md +15 -18
- data/ext/fasttext/ext.cpp +100 -134
- data/ext/fasttext/extconf.rb +3 -2
- data/lib/fasttext/classifier.rb +13 -10
- data/lib/fasttext/model.rb +10 -0
- data/lib/fasttext/vectorizer.rb +5 -4
- data/lib/fasttext/version.rb +1 -1
- metadata +13 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '08c75eccaf6de25741a61c5dc5e5647f6387f14fe3589403930a3a1775b7842b'
|
4
|
+
data.tar.gz: 91ea1d96d29539e66857749c15858d3cd1e317ecc9b2988dc07136ecaceee4b4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae540aef9c79ad8999ce715d19493467dacd71edca501eefcf97f4d5d3200f8c118437a5865b12b04447cf62cf352d04da17cbccbfc41ab2ab0cdc0e27a9b765
|
7
|
+
data.tar.gz: 0e75583df1ae15cebc9b71b362834634194cca865610524dca9332e5e2cc86ca3977cfdc2e635ac2340e08abaf37a5bc4d1a8b18da2e10a2bad09cd03f19626d
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,20 @@
|
|
1
|
+
## 0.2.3 (2021-11-15)
|
2
|
+
|
3
|
+
- Fixed installation error with ARM Mac
|
4
|
+
|
5
|
+
## 0.2.2 (2021-10-16)
|
6
|
+
|
7
|
+
- Fixed `file cannot be opened` errors
|
8
|
+
|
9
|
+
## 0.2.1 (2021-05-23)
|
10
|
+
|
11
|
+
- Improved performance
|
12
|
+
|
13
|
+
## 0.2.0 (2021-05-17)
|
14
|
+
|
15
|
+
- Updated to Rice 4
|
16
|
+
- Dropped support for Ruby < 2.6
|
17
|
+
|
1
18
|
## 0.1.3 (2020-04-28)
|
2
19
|
|
3
20
|
- Updated fastText to 0.9.2
|
data/LICENSE.txt
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
Copyright (c) 2019-2020 Andrew Kane
|
2
|
-
|
3
1
|
MIT License
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
Copyright (c) 2016-present, Facebook, Inc.
|
4
|
+
Copyright (c) 2019-2021 Andrew Kane
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
12
|
|
13
|
-
The above copyright notice and this permission notice shall be
|
14
|
-
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
15
|
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
OF
|
22
|
-
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
# fastText
|
1
|
+
# fastText Ruby
|
2
2
|
|
3
3
|
[fastText](https://fasttext.cc) - efficient text classification and representation learning - for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://
|
5
|
+
[![Build Status](https://github.com/ankane/fastText-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/fastText-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -127,6 +127,12 @@ Get a word vector
|
|
127
127
|
model.word_vector("carrot")
|
128
128
|
```
|
129
129
|
|
130
|
+
Get a sentence vector
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
model.sentence_vector("sentence text")
|
134
|
+
```
|
135
|
+
|
130
136
|
Get words
|
131
137
|
|
132
138
|
```ruby
|
@@ -172,7 +178,7 @@ FastText::Classifier.new(
|
|
172
178
|
thread: 3, # number of threads
|
173
179
|
lr_update_rate: 100, # change the rate of updates for the learning rate
|
174
180
|
t: 0.0001, # sampling threshold
|
175
|
-
label_prefix: "__label__"
|
181
|
+
label_prefix: "__label__", # label prefix
|
176
182
|
verbose: 2, # verbose
|
177
183
|
pretrained_vectors: nil, # pretrained word vectors (.vec file)
|
178
184
|
autotune_metric: "f1", # autotune optimization metric
|
@@ -232,7 +238,7 @@ __label__spam text from document three
|
|
232
238
|
|
233
239
|
## Pretrained Models
|
234
240
|
|
235
|
-
There are a number of [pretrained models](https://fasttext.cc/docs/en/
|
241
|
+
There are a number of [pretrained models](https://fasttext.cc/docs/en/supervised-models.html) you can download
|
236
242
|
|
237
243
|
### Language Identification
|
238
244
|
|
@@ -248,33 +254,24 @@ Get language predictions
|
|
248
254
|
model.predict("bon appétit")
|
249
255
|
```
|
250
256
|
|
251
|
-
## rbenv
|
252
|
-
|
253
|
-
This library uses [Rice](https://github.com/jasonroelofs/rice) to interface with the fastText C++ library. Rice and earlier versions of rbenv don’t play nicely together. If you encounter an error during installation, upgrade ruby-build and reinstall your Ruby version.
|
254
|
-
|
255
|
-
```sh
|
256
|
-
brew upgrade ruby-build
|
257
|
-
rbenv install [version]
|
258
|
-
```
|
259
|
-
|
260
257
|
## History
|
261
258
|
|
262
|
-
View the [changelog](https://github.com/ankane/
|
259
|
+
View the [changelog](https://github.com/ankane/fastText-ruby/blob/master/CHANGELOG.md)
|
263
260
|
|
264
261
|
## Contributing
|
265
262
|
|
266
263
|
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
267
264
|
|
268
|
-
- [Report bugs](https://github.com/ankane/
|
269
|
-
- Fix bugs and [submit pull requests](https://github.com/ankane/
|
265
|
+
- [Report bugs](https://github.com/ankane/fastText-ruby/issues)
|
266
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/fastText-ruby/pulls)
|
270
267
|
- Write, clarify, or fix documentation
|
271
268
|
- Suggest or add new features
|
272
269
|
|
273
270
|
To get started with development:
|
274
271
|
|
275
272
|
```sh
|
276
|
-
git clone https://github.com/ankane/fastText.git
|
277
|
-
cd fastText
|
273
|
+
git clone --recursive https://github.com/ankane/fastText-ruby.git
|
274
|
+
cd fastText-ruby
|
278
275
|
bundle install
|
279
276
|
bundle exec rake compile
|
280
277
|
bundle exec rake test
|
data/ext/fasttext/ext.cpp
CHANGED
@@ -13,129 +13,37 @@
|
|
13
13
|
#include <vector.h>
|
14
14
|
|
15
15
|
// rice
|
16
|
-
#include <rice/
|
17
|
-
#include <rice/
|
18
|
-
#include <rice/Data_Type.hpp>
|
19
|
-
#include <rice/Hash.hpp>
|
16
|
+
#include <rice/rice.hpp>
|
17
|
+
#include <rice/stl.hpp>
|
20
18
|
|
19
|
+
using fasttext::Args;
|
21
20
|
using fasttext::FastText;
|
22
21
|
|
23
22
|
using Rice::Array;
|
24
23
|
using Rice::Constructor;
|
25
|
-
using Rice::Hash;
|
26
24
|
using Rice::Module;
|
27
|
-
using Rice::Object;
|
28
25
|
using Rice::define_class_under;
|
29
26
|
using Rice::define_module;
|
30
27
|
using Rice::define_module_under;
|
31
28
|
|
32
|
-
|
33
|
-
inline
|
34
|
-
Object to_ruby<std::vector<std::pair<fasttext::real, std::string>>>(std::vector<std::pair<fasttext::real, std::string>> const & x)
|
29
|
+
namespace Rice::detail
|
35
30
|
{
|
36
|
-
|
37
|
-
|
38
|
-
Array a;
|
39
|
-
a.push(v.first);
|
40
|
-
a.push(v.second);
|
41
|
-
ret.push(a);
|
42
|
-
}
|
43
|
-
return ret;
|
44
|
-
}
|
45
|
-
|
46
|
-
fasttext::Args buildArgs(Hash h) {
|
47
|
-
fasttext::Args a;
|
48
|
-
|
49
|
-
std::vector<Hash::Entry> v;
|
50
|
-
Hash::iterator it = h.begin();
|
51
|
-
Hash::iterator end = h.end();
|
52
|
-
|
53
|
-
for(; it != end; ++it)
|
31
|
+
template<>
|
32
|
+
class To_Ruby<std::vector<std::pair<fasttext::real, std::string>>>
|
54
33
|
{
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
} else if (name == "lr_update_rate") {
|
65
|
-
a.lrUpdateRate = from_ruby<int>(value);
|
66
|
-
} else if (name == "dim") {
|
67
|
-
a.dim = from_ruby<int>(value);
|
68
|
-
} else if (name == "ws") {
|
69
|
-
a.ws = from_ruby<int>(value);
|
70
|
-
} else if (name == "epoch") {
|
71
|
-
a.epoch = from_ruby<int>(value);
|
72
|
-
} else if (name == "min_count") {
|
73
|
-
a.minCount = from_ruby<int>(value);
|
74
|
-
} else if (name == "min_count_label") {
|
75
|
-
a.minCountLabel = from_ruby<int>(value);
|
76
|
-
} else if (name == "neg") {
|
77
|
-
a.neg = from_ruby<int>(value);
|
78
|
-
} else if (name == "word_ngrams") {
|
79
|
-
a.wordNgrams = from_ruby<int>(value);
|
80
|
-
} else if (name == "loss") {
|
81
|
-
std::string str = from_ruby<std::string>(value);
|
82
|
-
if (str == "softmax") {
|
83
|
-
a.loss = fasttext::loss_name::softmax;
|
84
|
-
} else if (str == "ns") {
|
85
|
-
a.loss = fasttext::loss_name::ns;
|
86
|
-
} else if (str == "hs") {
|
87
|
-
a.loss = fasttext::loss_name::hs;
|
88
|
-
} else if (str == "ova") {
|
89
|
-
a.loss = fasttext::loss_name::ova;
|
90
|
-
} else {
|
91
|
-
throw std::invalid_argument("Unknown loss: " + str);
|
34
|
+
public:
|
35
|
+
VALUE convert(std::vector<std::pair<fasttext::real, std::string>> const & x)
|
36
|
+
{
|
37
|
+
Array ret;
|
38
|
+
for (const auto& v : x) {
|
39
|
+
Array a;
|
40
|
+
a.push(v.first);
|
41
|
+
a.push(v.second);
|
42
|
+
ret.push(a);
|
92
43
|
}
|
93
|
-
|
94
|
-
std::string str = from_ruby<std::string>(value);
|
95
|
-
if (str == "supervised") {
|
96
|
-
a.model = fasttext::model_name::sup;
|
97
|
-
} else if (str == "skipgram") {
|
98
|
-
a.model = fasttext::model_name::sg;
|
99
|
-
} else if (str == "cbow") {
|
100
|
-
a.model = fasttext::model_name::cbow;
|
101
|
-
} else {
|
102
|
-
throw std::invalid_argument("Unknown model: " + str);
|
103
|
-
}
|
104
|
-
} else if (name == "bucket") {
|
105
|
-
a.bucket = from_ruby<int>(value);
|
106
|
-
} else if (name == "minn") {
|
107
|
-
a.minn = from_ruby<int>(value);
|
108
|
-
} else if (name == "maxn") {
|
109
|
-
a.maxn = from_ruby<int>(value);
|
110
|
-
} else if (name == "thread") {
|
111
|
-
a.thread = from_ruby<int>(value);
|
112
|
-
} else if (name == "t") {
|
113
|
-
a.t = from_ruby<double>(value);
|
114
|
-
} else if (name == "label_prefix") {
|
115
|
-
a.label = from_ruby<std::string>(value);
|
116
|
-
} else if (name == "verbose") {
|
117
|
-
a.verbose = from_ruby<int>(value);
|
118
|
-
} else if (name == "pretrained_vectors") {
|
119
|
-
a.pretrainedVectors = from_ruby<std::string>(value);
|
120
|
-
} else if (name == "save_output") {
|
121
|
-
a.saveOutput = from_ruby<bool>(value);
|
122
|
-
} else if (name == "seed") {
|
123
|
-
a.seed = from_ruby<int>(value);
|
124
|
-
} else if (name == "autotune_validation_file") {
|
125
|
-
a.autotuneValidationFile = from_ruby<std::string>(value);
|
126
|
-
} else if (name == "autotune_metric") {
|
127
|
-
a.autotuneMetric = from_ruby<std::string>(value);
|
128
|
-
} else if (name == "autotune_predictions") {
|
129
|
-
a.autotunePredictions = from_ruby<int>(value);
|
130
|
-
} else if (name == "autotune_duration") {
|
131
|
-
a.autotuneDuration = from_ruby<int>(value);
|
132
|
-
} else if (name == "autotune_model_size") {
|
133
|
-
a.autotuneModelSize = from_ruby<std::string>(value);
|
134
|
-
} else {
|
135
|
-
throw std::invalid_argument("Unknown argument: " + name);
|
44
|
+
return ret;
|
136
45
|
}
|
137
|
-
}
|
138
|
-
return a;
|
46
|
+
};
|
139
47
|
}
|
140
48
|
|
141
49
|
extern "C"
|
@@ -144,11 +52,68 @@ void Init_ext()
|
|
144
52
|
Module rb_mFastText = define_module("FastText");
|
145
53
|
Module rb_mExt = define_module_under(rb_mFastText, "Ext");
|
146
54
|
|
55
|
+
define_class_under<Args>(rb_mExt, "Args")
|
56
|
+
.define_constructor(Constructor<Args>())
|
57
|
+
.define_attr("input", &Args::input)
|
58
|
+
.define_attr("output", &Args::output)
|
59
|
+
.define_attr("lr", &Args::lr)
|
60
|
+
.define_attr("lr_update_rate", &Args::lrUpdateRate)
|
61
|
+
.define_attr("dim", &Args::dim)
|
62
|
+
.define_attr("ws", &Args::ws)
|
63
|
+
.define_attr("epoch", &Args::epoch)
|
64
|
+
.define_attr("min_count", &Args::minCount)
|
65
|
+
.define_attr("min_count_label", &Args::minCountLabel)
|
66
|
+
.define_attr("neg", &Args::neg)
|
67
|
+
.define_attr("word_ngrams", &Args::wordNgrams)
|
68
|
+
.define_method(
|
69
|
+
"loss=",
|
70
|
+
[](Args& a, const std::string& str) {
|
71
|
+
if (str == "softmax") {
|
72
|
+
a.loss = fasttext::loss_name::softmax;
|
73
|
+
} else if (str == "ns") {
|
74
|
+
a.loss = fasttext::loss_name::ns;
|
75
|
+
} else if (str == "hs") {
|
76
|
+
a.loss = fasttext::loss_name::hs;
|
77
|
+
} else if (str == "ova") {
|
78
|
+
a.loss = fasttext::loss_name::ova;
|
79
|
+
} else {
|
80
|
+
throw std::invalid_argument("Unknown loss: " + str);
|
81
|
+
}
|
82
|
+
})
|
83
|
+
.define_method(
|
84
|
+
"model=",
|
85
|
+
[](Args& a, const std::string& str) {
|
86
|
+
if (str == "supervised") {
|
87
|
+
a.model = fasttext::model_name::sup;
|
88
|
+
} else if (str == "skipgram") {
|
89
|
+
a.model = fasttext::model_name::sg;
|
90
|
+
} else if (str == "cbow") {
|
91
|
+
a.model = fasttext::model_name::cbow;
|
92
|
+
} else {
|
93
|
+
throw std::invalid_argument("Unknown model: " + str);
|
94
|
+
}
|
95
|
+
})
|
96
|
+
.define_attr("bucket", &Args::bucket)
|
97
|
+
.define_attr("minn", &Args::minn)
|
98
|
+
.define_attr("maxn", &Args::maxn)
|
99
|
+
.define_attr("thread", &Args::thread)
|
100
|
+
.define_attr("t", &Args::t)
|
101
|
+
.define_attr("label_prefix", &Args::label)
|
102
|
+
.define_attr("verbose", &Args::verbose)
|
103
|
+
.define_attr("pretrained_vectors", &Args::pretrainedVectors)
|
104
|
+
.define_attr("save_output", &Args::saveOutput)
|
105
|
+
.define_attr("seed", &Args::seed)
|
106
|
+
.define_attr("autotune_validation_file", &Args::autotuneValidationFile)
|
107
|
+
.define_attr("autotune_metric", &Args::autotuneMetric)
|
108
|
+
.define_attr("autotune_predictions", &Args::autotunePredictions)
|
109
|
+
.define_attr("autotune_duration", &Args::autotuneDuration)
|
110
|
+
.define_attr("autotune_model_size", &Args::autotuneModelSize);
|
111
|
+
|
147
112
|
define_class_under<FastText>(rb_mExt, "Model")
|
148
113
|
.define_constructor(Constructor<FastText>())
|
149
114
|
.define_method(
|
150
115
|
"words",
|
151
|
-
|
116
|
+
[](FastText& m) {
|
152
117
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
153
118
|
std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::word);
|
154
119
|
|
@@ -166,7 +131,7 @@ void Init_ext()
|
|
166
131
|
})
|
167
132
|
.define_method(
|
168
133
|
"labels",
|
169
|
-
|
134
|
+
[](FastText& m) {
|
170
135
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
171
136
|
std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::label);
|
172
137
|
|
@@ -184,7 +149,7 @@ void Init_ext()
|
|
184
149
|
})
|
185
150
|
.define_method(
|
186
151
|
"test",
|
187
|
-
|
152
|
+
[](FastText& m, const std::string& filename, int32_t k) {
|
188
153
|
std::ifstream ifs(filename);
|
189
154
|
if (!ifs.is_open()) {
|
190
155
|
throw std::invalid_argument("Test file cannot be opened!");
|
@@ -201,17 +166,21 @@ void Init_ext()
|
|
201
166
|
})
|
202
167
|
.define_method(
|
203
168
|
"load_model",
|
204
|
-
|
169
|
+
[](FastText& m, const std::string& s) {
|
170
|
+
m.loadModel(s);
|
171
|
+
})
|
205
172
|
.define_method(
|
206
173
|
"save_model",
|
207
|
-
|
174
|
+
[](FastText& m, const std::string& s) {
|
175
|
+
m.saveModel(s);
|
176
|
+
})
|
208
177
|
.define_method("dimension", &FastText::getDimension)
|
209
178
|
.define_method("quantized?", &FastText::isQuant)
|
210
179
|
.define_method("word_id", &FastText::getWordId)
|
211
180
|
.define_method("subword_id", &FastText::getSubwordId)
|
212
181
|
.define_method(
|
213
182
|
"predict",
|
214
|
-
|
183
|
+
[](FastText& m, const std::string& text, int32_t k, float threshold) {
|
215
184
|
std::stringstream ioss(text);
|
216
185
|
std::vector<std::pair<fasttext::real, std::string>> predictions;
|
217
186
|
m.predictLine(ioss, predictions, k, threshold);
|
@@ -219,27 +188,26 @@ void Init_ext()
|
|
219
188
|
})
|
220
189
|
.define_method(
|
221
190
|
"nearest_neighbors",
|
222
|
-
|
191
|
+
[](FastText& m, const std::string& word, int32_t k) {
|
223
192
|
return m.getNN(word, k);
|
224
193
|
})
|
225
194
|
.define_method("analogies", &FastText::getAnalogies)
|
226
|
-
.define_method("ngram_vectors", &FastText::getNgramVectors)
|
195
|
+
// .define_method("ngram_vectors", &FastText::getNgramVectors)
|
227
196
|
.define_method(
|
228
197
|
"word_vector",
|
229
|
-
|
230
|
-
|
198
|
+
[](FastText& m, const std::string& word) {
|
199
|
+
auto dimension = m.getDimension();
|
231
200
|
fasttext::Vector vec = fasttext::Vector(dimension);
|
232
201
|
m.getWordVector(vec, word);
|
233
|
-
float* data = vec.data();
|
234
202
|
Array ret;
|
235
|
-
for (
|
236
|
-
ret.push(
|
203
|
+
for (size_t i = 0; i < vec.size(); i++) {
|
204
|
+
ret.push(vec[i]);
|
237
205
|
}
|
238
206
|
return ret;
|
239
207
|
})
|
240
208
|
.define_method(
|
241
209
|
"subwords",
|
242
|
-
|
210
|
+
[](FastText& m, const std::string& word) {
|
243
211
|
std::vector<std::string> subwords;
|
244
212
|
std::vector<int32_t> ngrams;
|
245
213
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
@@ -253,22 +221,20 @@ void Init_ext()
|
|
253
221
|
})
|
254
222
|
.define_method(
|
255
223
|
"sentence_vector",
|
256
|
-
|
224
|
+
[](FastText& m, const std::string& text) {
|
257
225
|
std::istringstream in(text);
|
258
|
-
|
226
|
+
auto dimension = m.getDimension();
|
259
227
|
fasttext::Vector vec = fasttext::Vector(dimension);
|
260
228
|
m.getSentenceVector(in, vec);
|
261
|
-
float* data = vec.data();
|
262
229
|
Array ret;
|
263
|
-
for (
|
264
|
-
ret.push(
|
230
|
+
for (size_t i = 0; i < vec.size(); i++) {
|
231
|
+
ret.push(vec[i]);
|
265
232
|
}
|
266
233
|
return ret;
|
267
234
|
})
|
268
235
|
.define_method(
|
269
236
|
"train",
|
270
|
-
|
271
|
-
auto a = buildArgs(h);
|
237
|
+
[](FastText& m, Args& a) {
|
272
238
|
if (a.hasAutotune()) {
|
273
239
|
fasttext::Autotune autotune(std::shared_ptr<fasttext::FastText>(&m, [](fasttext::FastText*) {}));
|
274
240
|
autotune.train(a);
|
@@ -278,17 +244,17 @@ void Init_ext()
|
|
278
244
|
})
|
279
245
|
.define_method(
|
280
246
|
"quantize",
|
281
|
-
|
282
|
-
m.quantize(
|
247
|
+
[](FastText& m, Args& a) {
|
248
|
+
m.quantize(a);
|
283
249
|
})
|
284
250
|
.define_method(
|
285
251
|
"supervised?",
|
286
|
-
|
252
|
+
[](FastText& m) {
|
287
253
|
return m.getArgs().model == fasttext::model_name::sup;
|
288
254
|
})
|
289
255
|
.define_method(
|
290
256
|
"label_prefix",
|
291
|
-
|
257
|
+
[](FastText& m) {
|
292
258
|
return m.getArgs().label;
|
293
259
|
});
|
294
260
|
}
|
data/ext/fasttext/extconf.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
require "mkmf-rice"
|
2
2
|
|
3
|
-
#
|
3
|
+
# -march=native not supported with ARM Mac
|
4
|
+
default_optflags = RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm/i ? "" : "-march=native"
|
4
5
|
# -pthread and -O3 set by default
|
5
|
-
$CXXFLAGS << " -std=c++
|
6
|
+
$CXXFLAGS << " -std=c++17 $(optflags) -funroll-loops " << with_config("optflags", default_optflags)
|
6
7
|
|
7
8
|
ext = File.expand_path(".", __dir__)
|
8
9
|
fasttext = File.expand_path("../../vendor/fastText/src", __dir__)
|
data/lib/fasttext/classifier.rb
CHANGED
@@ -30,14 +30,16 @@ module FastText
|
|
30
30
|
}
|
31
31
|
|
32
32
|
def fit(x, y = nil, autotune_set: nil)
|
33
|
-
input = input_path(x, y)
|
33
|
+
input, _ref = input_path(x, y)
|
34
34
|
@m ||= Ext::Model.new
|
35
|
-
|
35
|
+
a = build_args(DEFAULT_OPTIONS)
|
36
|
+
a.input = input
|
37
|
+
a.model = "supervised"
|
36
38
|
if autotune_set
|
37
39
|
x, y = autotune_set
|
38
|
-
|
40
|
+
a.autotune_validation_file, _autotune_ref = input_path(x, y)
|
39
41
|
end
|
40
|
-
m.train(
|
42
|
+
m.train(a)
|
41
43
|
end
|
42
44
|
|
43
45
|
def predict(text, k: 1, threshold: 0.0)
|
@@ -47,16 +49,16 @@ module FastText
|
|
47
49
|
# TODO predict multiple in C++ for performance
|
48
50
|
result =
|
49
51
|
text.map do |t|
|
50
|
-
m.predict(prep_text(t), k, threshold).
|
52
|
+
m.predict(prep_text(t), k, threshold).to_h do |v|
|
51
53
|
[remove_prefix(v[1]), v[0]]
|
52
|
-
end
|
54
|
+
end
|
53
55
|
end
|
54
56
|
|
55
57
|
multiple ? result : result.first
|
56
58
|
end
|
57
59
|
|
58
60
|
def test(x, y = nil, k: 1)
|
59
|
-
input = input_path(x, y)
|
61
|
+
input, _ref = input_path(x, y)
|
60
62
|
res = m.test(input, k)
|
61
63
|
{
|
62
64
|
examples: res[0],
|
@@ -67,7 +69,8 @@ module FastText
|
|
67
69
|
|
68
70
|
# TODO support options
|
69
71
|
def quantize
|
70
|
-
|
72
|
+
a = Ext::Args.new
|
73
|
+
m.quantize(a)
|
71
74
|
end
|
72
75
|
|
73
76
|
def labels(include_freq: false)
|
@@ -85,7 +88,7 @@ module FastText
|
|
85
88
|
def input_path(x, y)
|
86
89
|
if x.is_a?(String)
|
87
90
|
raise ArgumentError, "Cannot pass y with file" if y
|
88
|
-
x
|
91
|
+
[x, nil]
|
89
92
|
else
|
90
93
|
tempfile = Tempfile.new("fasttext")
|
91
94
|
x.zip(y) do |xi, yi|
|
@@ -95,7 +98,7 @@ module FastText
|
|
95
98
|
tempfile.write("\n")
|
96
99
|
end
|
97
100
|
tempfile.close
|
98
|
-
tempfile.path
|
101
|
+
[tempfile.path, tempfile]
|
99
102
|
end
|
100
103
|
end
|
101
104
|
|
data/lib/fasttext/model.rb
CHANGED
@@ -56,5 +56,15 @@ module FastText
|
|
56
56
|
def m
|
57
57
|
@m || (raise Error, "Not fit")
|
58
58
|
end
|
59
|
+
|
60
|
+
def build_args(default_options)
|
61
|
+
a = Ext::Args.new
|
62
|
+
opts = @options.dup
|
63
|
+
default_options.each do |k, v|
|
64
|
+
a.send("#{k}=", opts.delete(k) || v)
|
65
|
+
end
|
66
|
+
raise ArgumentError, "Unknown argument: #{opts.keys.first}" if opts.any?
|
67
|
+
a
|
68
|
+
end
|
59
69
|
end
|
60
70
|
end
|
data/lib/fasttext/vectorizer.rb
CHANGED
@@ -29,9 +29,10 @@ module FastText
|
|
29
29
|
}
|
30
30
|
|
31
31
|
def fit(x)
|
32
|
-
input = input_path(x)
|
33
32
|
@m ||= Ext::Model.new
|
34
|
-
|
33
|
+
a = build_args(DEFAULT_OPTIONS)
|
34
|
+
a.input, _ref = input_path(x)
|
35
|
+
m.train(a)
|
35
36
|
end
|
36
37
|
|
37
38
|
def nearest_neighbors(word, k: 10)
|
@@ -48,7 +49,7 @@ module FastText
|
|
48
49
|
# https://github.com/facebookresearch/fastText/issues/518
|
49
50
|
def input_path(x)
|
50
51
|
if x.is_a?(String)
|
51
|
-
x
|
52
|
+
[x, nil]
|
52
53
|
else
|
53
54
|
tempfile = Tempfile.new("fasttext")
|
54
55
|
x.each do |xi|
|
@@ -56,7 +57,7 @@ module FastText
|
|
56
57
|
tempfile.write("\n")
|
57
58
|
end
|
58
59
|
tempfile.close
|
59
|
-
tempfile.path
|
60
|
+
[tempfile.path, tempfile]
|
60
61
|
end
|
61
62
|
end
|
62
63
|
end
|
data/lib/fasttext/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fasttext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-11-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -16,72 +16,16 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 4.0.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
27
|
-
|
28
|
-
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rake-compiler
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: minitest
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '5'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '5'
|
83
|
-
description:
|
84
|
-
email: andrew@chartkick.com
|
26
|
+
version: 4.0.2
|
27
|
+
description:
|
28
|
+
email: andrew@ankane.org
|
85
29
|
executables: []
|
86
30
|
extensions:
|
87
31
|
- ext/fasttext/extconf.rb
|
@@ -127,11 +71,11 @@ files:
|
|
127
71
|
- vendor/fastText/src/utils.h
|
128
72
|
- vendor/fastText/src/vector.cc
|
129
73
|
- vendor/fastText/src/vector.h
|
130
|
-
homepage: https://github.com/ankane/fastText
|
74
|
+
homepage: https://github.com/ankane/fastText-ruby
|
131
75
|
licenses:
|
132
76
|
- MIT
|
133
77
|
metadata: {}
|
134
|
-
post_install_message:
|
78
|
+
post_install_message:
|
135
79
|
rdoc_options: []
|
136
80
|
require_paths:
|
137
81
|
- lib
|
@@ -139,16 +83,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
139
83
|
requirements:
|
140
84
|
- - ">="
|
141
85
|
- !ruby/object:Gem::Version
|
142
|
-
version: '2.
|
86
|
+
version: '2.6'
|
143
87
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
144
88
|
requirements:
|
145
89
|
- - ">="
|
146
90
|
- !ruby/object:Gem::Version
|
147
91
|
version: '0'
|
148
92
|
requirements: []
|
149
|
-
rubygems_version: 3.
|
150
|
-
signing_key:
|
93
|
+
rubygems_version: 3.2.22
|
94
|
+
signing_key:
|
151
95
|
specification_version: 4
|
152
|
-
summary:
|
153
|
-
Ruby
|
96
|
+
summary: Efficient text classification and representation learning for Ruby
|
154
97
|
test_files: []
|