fasttext 0.1.2 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f83be8c01c6a45a90758ccee430b3898396bcfdda5a2c338126ed9dc3620aea5
4
- data.tar.gz: 7e3dee8eb3afe12745f78448fd01ac68a2f0ac946bd01195f2f6e6081f62fbad
3
+ metadata.gz: 9aae7e20933f51ebebd802276d7e006ce792fcfc6dd94ea8ceda887ab0f4eca8
4
+ data.tar.gz: a90a7bbfffe424829052afc3519bc0473a7fc1b22a2d1ea0c8786d9372a0cdb5
5
5
  SHA512:
6
- metadata.gz: be3117e1aceed3f6126fc1d84eb87caf53abb3be802a419ebcaa284cb567ee9ac033d5860842690bbd1c0477a6e5013dfc36eb96f6fb067a63015150fa18a1fe
7
- data.tar.gz: dc2467f3f7317b5e1955ede144d9ad50c5abb1cc2b9dc5ad356350f192631b0142c91aecc4dd03b331d90f1cf65c7f3ba3ec176c17dc6b66a1226171261de1b6
6
+ metadata.gz: 3042a798560e5960d18d8bcefb66833bf621bc1dcf26f77742602d0d4e7b1e4cc33b078f9225c209ac9a0d2f89c9b3c98ad2446582af4a3ac4effc761192c58d
7
+ data.tar.gz: d70ce1005916a809a78b02f23232b1efd7cc59dc9d3ed2df7929f946db2f326c0df40729b2ea953846e906c2774d6b6e971650f3d585da65ae6562bdcb9a9b5e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,22 @@
1
+ ## 0.2.2 (2021-10-16)
2
+
3
+ - Fixed `file cannot be opened` errors
4
+
5
+ ## 0.2.1 (2021-05-23)
6
+
7
+ - Improved performance
8
+
9
+ ## 0.2.0 (2021-05-17)
10
+
11
+ - Updated to Rice 4
12
+ - Dropped support for Ruby < 2.6
13
+
14
+ ## 0.1.3 (2020-04-28)
15
+
16
+ - Updated fastText to 0.9.2
17
+ - Added support for autotune
18
+ - Added `--with-optflags` option
19
+
1
20
  ## 0.1.2 (2020-01-10)
2
21
 
3
22
  - Fixed installation error with Ruby 2.7
data/LICENSE.txt CHANGED
@@ -1,22 +1,22 @@
1
- Copyright (c) 2019-2020 Andrew Kane
2
-
3
1
  MIT License
4
2
 
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
3
+ Copyright (c) 2016-present, Facebook, Inc.
4
+ Copyright (c) 2019-2021 Andrew Kane
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
12
 
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
15
 
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  [fastText](https://fasttext.cc) - efficient text classification and representation learning - for Ruby
4
4
 
5
- [![Build Status](https://travis-ci.org/ankane/fasttext.svg?branch=master)](https://travis-ci.org/ankane/fasttext)
5
+ [![Build Status](https://github.com/ankane/fastText/workflows/build/badge.svg?branch=master)](https://github.com/ankane/fastText/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -77,6 +77,12 @@ model.labels
77
77
 
78
78
  > Use `include_freq: true` to get their frequency
79
79
 
80
+ Search for the best hyperparameters
81
+
82
+ ```ruby
83
+ model.fit(x, y, autotune_set: [x_valid, y_valid])
84
+ ```
85
+
80
86
  Compress the model - significantly reduces size but sacrifices a little performance
81
87
 
82
88
  ```ruby
@@ -121,6 +127,12 @@ Get a word vector
121
127
  model.word_vector("carrot")
122
128
  ```
123
129
 
130
+ Get a sentence vector
131
+
132
+ ```ruby
133
+ model.sentence_vector("sentence text")
134
+ ```
135
+
124
136
  Get words
125
137
 
126
138
  ```ruby
@@ -166,9 +178,13 @@ FastText::Classifier.new(
166
178
  thread: 3, # number of threads
167
179
  lr_update_rate: 100, # change the rate of updates for the learning rate
168
180
  t: 0.0001, # sampling threshold
169
- label_prefix: "__label__" # label prefix
181
+ label_prefix: "__label__", # label prefix
170
182
  verbose: 2, # verbose
171
- pretrained_vectors: nil # pretrained word vectors (.vec file)
183
+ pretrained_vectors: nil, # pretrained word vectors (.vec file)
184
+ autotune_metric: "f1", # autotune optimization metric
185
+ autotune_predictions: 1, # autotune predictions
186
+ autotune_duration: 300, # autotune search time in seconds
187
+ autotune_model_size: nil # autotune model size, like 2M
172
188
  )
173
189
  ```
174
190
 
@@ -200,7 +216,7 @@ FastText::Vectorizer.new(
200
216
  Input can be read directly from files
201
217
 
202
218
  ```ruby
203
- model.fit("train.txt")
219
+ model.fit("train.txt", autotune_set: "valid.txt")
204
220
  model.test("test.txt")
205
221
  ```
206
222
 
@@ -222,7 +238,7 @@ __label__spam text from document three
222
238
 
223
239
  ## Pretrained Models
224
240
 
225
- There are a number of [pretrained models](https://fasttext.cc/docs/en/english-vectors.html) you can download
241
+ There are a number of [pretrained models](https://fasttext.cc/docs/en/supervised-models.html) you can download
226
242
 
227
243
  ### Language Identification
228
244
 
@@ -238,15 +254,6 @@ Get language predictions
238
254
  model.predict("bon appétit")
239
255
  ```
240
256
 
241
- ## rbenv
242
-
243
- This library uses [Rice](https://github.com/jasonroelofs/rice) to interface with the fastText C++ library. Rice and earlier versions of rbenv don’t play nicely together. If you encounter an error during installation, upgrade ruby-build and reinstall your Ruby version.
244
-
245
- ```sh
246
- brew upgrade ruby-build
247
- rbenv install [version]
248
- ```
249
-
250
257
  ## History
251
258
 
252
259
  View the [changelog](https://github.com/ankane/fasttext/blob/master/CHANGELOG.md)
@@ -260,12 +267,12 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
260
267
  - Write, clarify, or fix documentation
261
268
  - Suggest or add new features
262
269
 
263
- To get started with development and testing:
270
+ To get started with development:
264
271
 
265
272
  ```sh
266
- git clone https://github.com/ankane/fasttext.git
267
- cd fasttext
273
+ git clone --recursive https://github.com/ankane/fastText.git
274
+ cd fastText
268
275
  bundle install
269
- rake compile
270
- rake test
276
+ bundle exec rake compile
277
+ bundle exec rake test
271
278
  ```
data/ext/fasttext/ext.cpp CHANGED
@@ -1,116 +1,49 @@
1
- #include <args.h>
2
- #include <densematrix.h>
3
- #include <fasttext.h>
4
- #include <rice/Data_Type.hpp>
5
- #include <rice/Constructor.hpp>
6
- #include <rice/Array.hpp>
7
- #include <rice/Hash.hpp>
8
- #include <real.h>
9
- #include <vector.h>
1
+ // stdlib
10
2
  #include <cmath>
11
3
  #include <iterator>
12
4
  #include <sstream>
13
5
  #include <stdexcept>
14
6
 
15
- using namespace Rice;
7
+ // fasttext
8
+ #include <args.h>
9
+ #include <autotune.h>
10
+ #include <densematrix.h>
11
+ #include <fasttext.h>
12
+ #include <real.h>
13
+ #include <vector.h>
16
14
 
17
- template<>
18
- inline
19
- Object to_ruby<std::vector<std::pair<fasttext::real, std::string>>>(std::vector<std::pair<fasttext::real, std::string>> const & x)
20
- {
21
- Array ret;
22
- for (const auto& v : x) {
23
- Array a;
24
- a.push(v.first);
25
- a.push(v.second);
26
- ret.push(a);
27
- }
28
- return ret;
29
- }
15
+ // rice
16
+ #include <rice/rice.hpp>
17
+ #include <rice/stl.hpp>
30
18
 
31
- fasttext::Args buildArgs(Hash h) {
32
- fasttext::Args a;
19
+ using fasttext::Args;
20
+ using fasttext::FastText;
33
21
 
34
- std::vector<Hash::Entry> v;
35
- Hash::iterator it = h.begin();
36
- Hash::iterator end = h.end();
22
+ using Rice::Array;
23
+ using Rice::Constructor;
24
+ using Rice::Module;
25
+ using Rice::define_class_under;
26
+ using Rice::define_module;
27
+ using Rice::define_module_under;
37
28
 
38
- for(; it != end; ++it)
29
+ namespace Rice::detail
30
+ {
31
+ template<>
32
+ class To_Ruby<std::vector<std::pair<fasttext::real, std::string>>>
39
33
  {
40
- std::string name = from_ruby<std::string>(it->key.to_s());
41
- Object value = it->value;
42
-
43
- if (name == "input") {
44
- a.input = from_ruby<std::string>(value);
45
- } else if (name == "output") {
46
- a.output = from_ruby<std::string>(value);
47
- } else if (name == "lr") {
48
- a.lr = from_ruby<double>(value);
49
- } else if (name == "lr_update_rate") {
50
- a.lrUpdateRate = from_ruby<int>(value);
51
- } else if (name == "dim") {
52
- a.dim = from_ruby<int>(value);
53
- } else if (name == "ws") {
54
- a.ws = from_ruby<int>(value);
55
- } else if (name == "epoch") {
56
- a.epoch = from_ruby<int>(value);
57
- } else if (name == "min_count") {
58
- a.minCount = from_ruby<int>(value);
59
- } else if (name == "min_count_label") {
60
- a.minCountLabel = from_ruby<int>(value);
61
- } else if (name == "neg") {
62
- a.neg = from_ruby<int>(value);
63
- } else if (name == "word_ngrams") {
64
- a.wordNgrams = from_ruby<int>(value);
65
- } else if (name == "loss") {
66
- std::string str = from_ruby<std::string>(value);
67
- if (str == "softmax") {
68
- a.loss = fasttext::loss_name::softmax;
69
- } else if (str == "ns") {
70
- a.loss = fasttext::loss_name::ns;
71
- } else if (str == "hs") {
72
- a.loss = fasttext::loss_name::hs;
73
- } else if (str == "ova") {
74
- a.loss = fasttext::loss_name::ova;
75
- } else {
76
- throw std::invalid_argument("Unknown loss: " + str);
77
- }
78
- } else if (name == "model") {
79
- std::string str = from_ruby<std::string>(value);
80
- if (str == "supervised") {
81
- a.model = fasttext::model_name::sup;
82
- } else if (str == "skipgram") {
83
- a.model = fasttext::model_name::sg;
84
- } else if (str == "cbow") {
85
- a.model = fasttext::model_name::cbow;
86
- } else {
87
- throw std::invalid_argument("Unknown model: " + str);
34
+ public:
35
+ VALUE convert(std::vector<std::pair<fasttext::real, std::string>> const & x)
36
+ {
37
+ Array ret;
38
+ for (const auto& v : x) {
39
+ Array a;
40
+ a.push(v.first);
41
+ a.push(v.second);
42
+ ret.push(a);
88
43
  }
89
- } else if (name == "bucket") {
90
- a.bucket = from_ruby<int>(value);
91
- } else if (name == "minn") {
92
- a.minn = from_ruby<int>(value);
93
- } else if (name == "maxn") {
94
- a.maxn = from_ruby<int>(value);
95
- } else if (name == "thread") {
96
- a.thread = from_ruby<int>(value);
97
- } else if (name == "t") {
98
- a.t = from_ruby<double>(value);
99
- } else if (name == "label_prefix") {
100
- a.label = from_ruby<std::string>(value);
101
- } else if (name == "verbose") {
102
- a.verbose = from_ruby<int>(value);
103
- } else if (name == "pretrained_vectors") {
104
- a.pretrainedVectors = from_ruby<std::string>(value);
105
- } else if (name == "save_output") {
106
- a.saveOutput = from_ruby<bool>(value);
107
- // } else if (name == "seed") {
108
- // a.seed = from_ruby<int>(value);
109
- } else {
110
- throw std::invalid_argument("Unknown argument: " + name);
44
+ return ret;
111
45
  }
112
- }
113
- return a;
46
+ };
114
47
  }
115
48
 
116
49
  extern "C"
@@ -119,11 +52,68 @@ void Init_ext()
119
52
  Module rb_mFastText = define_module("FastText");
120
53
  Module rb_mExt = define_module_under(rb_mFastText, "Ext");
121
54
 
122
- define_class_under<fasttext::FastText>(rb_mExt, "Model")
123
- .define_constructor(Constructor<fasttext::FastText>())
55
+ define_class_under<Args>(rb_mExt, "Args")
56
+ .define_constructor(Constructor<Args>())
57
+ .define_attr("input", &Args::input)
58
+ .define_attr("output", &Args::output)
59
+ .define_attr("lr", &Args::lr)
60
+ .define_attr("lr_update_rate", &Args::lrUpdateRate)
61
+ .define_attr("dim", &Args::dim)
62
+ .define_attr("ws", &Args::ws)
63
+ .define_attr("epoch", &Args::epoch)
64
+ .define_attr("min_count", &Args::minCount)
65
+ .define_attr("min_count_label", &Args::minCountLabel)
66
+ .define_attr("neg", &Args::neg)
67
+ .define_attr("word_ngrams", &Args::wordNgrams)
68
+ .define_method(
69
+ "loss=",
70
+ [](Args& a, const std::string& str) {
71
+ if (str == "softmax") {
72
+ a.loss = fasttext::loss_name::softmax;
73
+ } else if (str == "ns") {
74
+ a.loss = fasttext::loss_name::ns;
75
+ } else if (str == "hs") {
76
+ a.loss = fasttext::loss_name::hs;
77
+ } else if (str == "ova") {
78
+ a.loss = fasttext::loss_name::ova;
79
+ } else {
80
+ throw std::invalid_argument("Unknown loss: " + str);
81
+ }
82
+ })
83
+ .define_method(
84
+ "model=",
85
+ [](Args& a, const std::string& str) {
86
+ if (str == "supervised") {
87
+ a.model = fasttext::model_name::sup;
88
+ } else if (str == "skipgram") {
89
+ a.model = fasttext::model_name::sg;
90
+ } else if (str == "cbow") {
91
+ a.model = fasttext::model_name::cbow;
92
+ } else {
93
+ throw std::invalid_argument("Unknown model: " + str);
94
+ }
95
+ })
96
+ .define_attr("bucket", &Args::bucket)
97
+ .define_attr("minn", &Args::minn)
98
+ .define_attr("maxn", &Args::maxn)
99
+ .define_attr("thread", &Args::thread)
100
+ .define_attr("t", &Args::t)
101
+ .define_attr("label_prefix", &Args::label)
102
+ .define_attr("verbose", &Args::verbose)
103
+ .define_attr("pretrained_vectors", &Args::pretrainedVectors)
104
+ .define_attr("save_output", &Args::saveOutput)
105
+ .define_attr("seed", &Args::seed)
106
+ .define_attr("autotune_validation_file", &Args::autotuneValidationFile)
107
+ .define_attr("autotune_metric", &Args::autotuneMetric)
108
+ .define_attr("autotune_predictions", &Args::autotunePredictions)
109
+ .define_attr("autotune_duration", &Args::autotuneDuration)
110
+ .define_attr("autotune_model_size", &Args::autotuneModelSize);
111
+
112
+ define_class_under<FastText>(rb_mExt, "Model")
113
+ .define_constructor(Constructor<FastText>())
124
114
  .define_method(
125
115
  "words",
126
- *[](fasttext::FastText& m) {
116
+ [](FastText& m) {
127
117
  std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
128
118
  std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::word);
129
119
 
@@ -141,7 +131,7 @@ void Init_ext()
141
131
  })
142
132
  .define_method(
143
133
  "labels",
144
- *[](fasttext::FastText& m) {
134
+ [](FastText& m) {
145
135
  std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
146
136
  std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::label);
147
137
 
@@ -159,12 +149,12 @@ void Init_ext()
159
149
  })
160
150
  .define_method(
161
151
  "test",
162
- *[](fasttext::FastText& m, const std::string filename, int32_t k) {
152
+ [](FastText& m, const std::string& filename, int32_t k) {
163
153
  std::ifstream ifs(filename);
164
154
  if (!ifs.is_open()) {
165
155
  throw std::invalid_argument("Test file cannot be opened!");
166
156
  }
167
- fasttext::Meter meter;
157
+ fasttext::Meter meter(false);
168
158
  m.test(ifs, k, 0.0, meter);
169
159
  ifs.close();
170
160
 
@@ -176,17 +166,21 @@ void Init_ext()
176
166
  })
177
167
  .define_method(
178
168
  "load_model",
179
- *[](fasttext::FastText& m, std::string s) { m.loadModel(s); })
169
+ [](FastText& m, const std::string& s) {
170
+ m.loadModel(s);
171
+ })
180
172
  .define_method(
181
173
  "save_model",
182
- *[](fasttext::FastText& m, std::string s) { m.saveModel(s); })
183
- .define_method("dimension", &fasttext::FastText::getDimension)
184
- .define_method("quantized?", &fasttext::FastText::isQuant)
185
- .define_method("word_id", &fasttext::FastText::getWordId)
186
- .define_method("subword_id", &fasttext::FastText::getSubwordId)
174
+ [](FastText& m, const std::string& s) {
175
+ m.saveModel(s);
176
+ })
177
+ .define_method("dimension", &FastText::getDimension)
178
+ .define_method("quantized?", &FastText::isQuant)
179
+ .define_method("word_id", &FastText::getWordId)
180
+ .define_method("subword_id", &FastText::getSubwordId)
187
181
  .define_method(
188
182
  "predict",
189
- *[](fasttext::FastText& m, const std::string text, int32_t k, float threshold) {
183
+ [](FastText& m, const std::string& text, int32_t k, float threshold) {
190
184
  std::stringstream ioss(text);
191
185
  std::vector<std::pair<fasttext::real, std::string>> predictions;
192
186
  m.predictLine(ioss, predictions, k, threshold);
@@ -194,27 +188,26 @@ void Init_ext()
194
188
  })
195
189
  .define_method(
196
190
  "nearest_neighbors",
197
- *[](fasttext::FastText& m, const std::string& word, int32_t k) {
191
+ [](FastText& m, const std::string& word, int32_t k) {
198
192
  return m.getNN(word, k);
199
193
  })
200
- .define_method("analogies", &fasttext::FastText::getAnalogies)
201
- .define_method("ngram_vectors", &fasttext::FastText::getNgramVectors)
194
+ .define_method("analogies", &FastText::getAnalogies)
195
+ // .define_method("ngram_vectors", &FastText::getNgramVectors)
202
196
  .define_method(
203
197
  "word_vector",
204
- *[](fasttext::FastText& m, const std::string word) {
205
- int dimension = m.getDimension();
198
+ [](FastText& m, const std::string& word) {
199
+ auto dimension = m.getDimension();
206
200
  fasttext::Vector vec = fasttext::Vector(dimension);
207
201
  m.getWordVector(vec, word);
208
- float* data = vec.data();
209
202
  Array ret;
210
- for (int i = 0; i < dimension; i++) {
211
- ret.push(data[i]);
203
+ for (size_t i = 0; i < vec.size(); i++) {
204
+ ret.push(vec[i]);
212
205
  }
213
206
  return ret;
214
207
  })
215
208
  .define_method(
216
209
  "subwords",
217
- *[](fasttext::FastText& m, const std::string word) {
210
+ [](FastText& m, const std::string& word) {
218
211
  std::vector<std::string> subwords;
219
212
  std::vector<int32_t> ngrams;
220
213
  std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
@@ -228,36 +221,40 @@ void Init_ext()
228
221
  })
229
222
  .define_method(
230
223
  "sentence_vector",
231
- *[](fasttext::FastText& m, const std::string text) {
224
+ [](FastText& m, const std::string& text) {
232
225
  std::istringstream in(text);
233
- int dimension = m.getDimension();
226
+ auto dimension = m.getDimension();
234
227
  fasttext::Vector vec = fasttext::Vector(dimension);
235
228
  m.getSentenceVector(in, vec);
236
- float* data = vec.data();
237
229
  Array ret;
238
- for (int i = 0; i < dimension; i++) {
239
- ret.push(data[i]);
230
+ for (size_t i = 0; i < vec.size(); i++) {
231
+ ret.push(vec[i]);
240
232
  }
241
233
  return ret;
242
234
  })
243
235
  .define_method(
244
236
  "train",
245
- *[](fasttext::FastText& m, Hash h) {
246
- m.train(buildArgs(h));
237
+ [](FastText& m, Args& a) {
238
+ if (a.hasAutotune()) {
239
+ fasttext::Autotune autotune(std::shared_ptr<fasttext::FastText>(&m, [](fasttext::FastText*) {}));
240
+ autotune.train(a);
241
+ } else {
242
+ m.train(a);
243
+ }
247
244
  })
248
245
  .define_method(
249
246
  "quantize",
250
- *[](fasttext::FastText& m, Hash h) {
251
- m.quantize(buildArgs(h));
247
+ [](FastText& m, Args& a) {
248
+ m.quantize(a);
252
249
  })
253
250
  .define_method(
254
251
  "supervised?",
255
- *[](fasttext::FastText& m) {
252
+ [](FastText& m) {
256
253
  return m.getArgs().model == fasttext::model_name::sup;
257
254
  })
258
255
  .define_method(
259
256
  "label_prefix",
260
- *[](fasttext::FastText& m) {
257
+ [](FastText& m) {
261
258
  return m.getArgs().label;
262
259
  });
263
260
  }
@@ -1,9 +1,7 @@
1
1
  require "mkmf-rice"
2
2
 
3
- abort "Missing stdc++" unless have_library("stdc++")
4
-
5
- # TODO use -std=c++14 when available
6
- $CXXFLAGS << " -pthread -std=c++11 -funroll-loops -O3 -march=native"
3
+ # -pthread and -O3 set by default
4
+ $CXXFLAGS << " -std=c++17 $(optflags) -funroll-loops " << with_config("optflags", "-march=native")
7
5
 
8
6
  ext = File.expand_path(".", __dir__)
9
7
  fasttext = File.expand_path("../../vendor/fastText/src", __dir__)
@@ -21,13 +21,25 @@ module FastText
21
21
  verbose: 2,
22
22
  pretrained_vectors: "",
23
23
  save_output: false,
24
- # seed: 0
24
+ seed: 0,
25
+ autotune_validation_file: "",
26
+ autotune_metric: "f1",
27
+ autotune_predictions: 1,
28
+ autotune_duration: 60 * 5,
29
+ autotune_model_size: ""
25
30
  }
26
31
 
27
- def fit(x, y = nil)
28
- input = input_path(x, y)
32
+ def fit(x, y = nil, autotune_set: nil)
33
+ input, _ref = input_path(x, y)
29
34
  @m ||= Ext::Model.new
30
- m.train(DEFAULT_OPTIONS.merge(@options).merge(input: input, model: "supervised"))
35
+ a = build_args(DEFAULT_OPTIONS)
36
+ a.input = input
37
+ a.model = "supervised"
38
+ if autotune_set
39
+ x, y = autotune_set
40
+ a.autotune_validation_file, _autotune_ref = input_path(x, y)
41
+ end
42
+ m.train(a)
31
43
  end
32
44
 
33
45
  def predict(text, k: 1, threshold: 0.0)
@@ -37,16 +49,16 @@ module FastText
37
49
  # TODO predict multiple in C++ for performance
38
50
  result =
39
51
  text.map do |t|
40
- m.predict(prep_text(t), k, threshold).map do |v|
52
+ m.predict(prep_text(t), k, threshold).to_h do |v|
41
53
  [remove_prefix(v[1]), v[0]]
42
- end.to_h
54
+ end
43
55
  end
44
56
 
45
57
  multiple ? result : result.first
46
58
  end
47
59
 
48
60
  def test(x, y = nil, k: 1)
49
- input = input_path(x, y)
61
+ input, _ref = input_path(x, y)
50
62
  res = m.test(input, k)
51
63
  {
52
64
  examples: res[0],
@@ -57,7 +69,8 @@ module FastText
57
69
 
58
70
  # TODO support options
59
71
  def quantize
60
- m.quantize({})
72
+ a = Ext::Args.new
73
+ m.quantize(a)
61
74
  end
62
75
 
63
76
  def labels(include_freq: false)
@@ -75,7 +88,7 @@ module FastText
75
88
  def input_path(x, y)
76
89
  if x.is_a?(String)
77
90
  raise ArgumentError, "Cannot pass y with file" if y
78
- x
91
+ [x, nil]
79
92
  else
80
93
  tempfile = Tempfile.new("fasttext")
81
94
  x.zip(y) do |xi, yi|
@@ -85,7 +98,7 @@ module FastText
85
98
  tempfile.write("\n")
86
99
  end
87
100
  tempfile.close
88
- tempfile.path
101
+ [tempfile.path, tempfile]
89
102
  end
90
103
  end
91
104
 
@@ -56,5 +56,15 @@ module FastText
56
56
  def m
57
57
  @m || (raise Error, "Not fit")
58
58
  end
59
+
60
+ def build_args(default_options)
61
+ a = Ext::Args.new
62
+ opts = @options.dup
63
+ default_options.each do |k, v|
64
+ a.send("#{k}=", opts.delete(k) || v)
65
+ end
66
+ raise ArgumentError, "Unknown argument: #{opts.keys.first}" if opts.any?
67
+ a
68
+ end
59
69
  end
60
70
  end